# cgit view: summary | refs | log | tree | commit | diff
# path: root/learn_torch/text_transformer.py
# blob: 1eb3877520ae8bb791be44fa9a2fa1a7ddabbdab (plain)
1
2
3
4
5
6
7
8
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Training split of WikiText2 and the 'basic_english' tokenizer used to
# split each raw line into tokens.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')

# Build the vocabulary from the tokenized training lines, registering
# '<unk>' as a special token.
# NOTE(review): this pass iterates train_iter; confirm whether it must be
# re-created before being traversed again with the installed torchtext.
vocab = build_vocab_from_iterator(
    (tokenizer(line) for line in train_iter),
    specials=['<unk>'],
)

# Make the index of '<unk>' the vocab's default index (per the torchtext
# docs, the value returned when an out-of-vocabulary token is looked up).
vocab.set_default_index(vocab['<unk>'])