diff options
| author | zhang <zch921005@126.com> | 2022-05-04 08:47:54 +0800 |
|---|---|---|
| committer | zhang <zch921005@126.com> | 2022-05-04 08:47:54 +0800 |
| commit | 2180c68999eb8dc0c7bcec015b2703f5b8b20223 (patch) | |
| tree | 3ec71623038ff8b90a5bc4e32da14a7382d42d9d /learn_torch/text_transformer.py | |
| parent | 70aebb73b81b50911e2107cd4519e69f09971021 (diff) | |
ndarray axis
Diffstat (limited to 'learn_torch/text_transformer.py')
| -rw-r--r-- | learn_torch/text_transformer.py | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/learn_torch/text_transformer.py b/learn_torch/text_transformer.py
new file mode 100644
index 0000000..1eb3877
--- /dev/null
+++ b/learn_torch/text_transformer.py
@@ -0,0 +1,8 @@
+from torchtext.datasets import WikiText2
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+
+train_iter = WikiText2(split='train')
+tokenizer = get_tokenizer('basic_english')
+vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
+vocab.set_default_index(vocab['<unk>'])
\ No newline at end of file
