summary | refs | log | tree | commit | diff
path: root/learn_torch/text_transformer.py
diff options
context:
space:
mode:
author zhang <zch921005@126.com> 2022-05-04 08:47:54 +0800
committer zhang <zch921005@126.com> 2022-05-04 08:47:54 +0800
commit 2180c68999eb8dc0c7bcec015b2703f5b8b20223 (patch)
tree 3ec71623038ff8b90a5bc4e32da14a7382d42d9d /learn_torch/text_transformer.py
parent 70aebb73b81b50911e2107cd4519e69f09971021 (diff)
ndarray axis
Diffstat (limited to 'learn_torch/text_transformer.py')
-rw-r--r-- learn_torch/text_transformer.py 8
1 files changed, 8 insertions, 0 deletions
diff --git a/learn_torch/text_transformer.py b/learn_torch/text_transformer.py
new file mode 100644
index 0000000..1eb3877
--- /dev/null
+++ b/learn_torch/text_transformer.py
@@ -0,0 +1,8 @@
+from torchtext.datasets import WikiText2
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+
+train_iter = WikiText2(split='train')
+tokenizer = get_tokenizer('basic_english')
+vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
+vocab.set_default_index(vocab['<unk>'])
\ No newline at end of file