diff options
| author | zhang <zch921005@126.com> | 2022-07-31 13:07:02 +0800 |
|---|---|---|
| committer | zhang <zch921005@126.com> | 2022-07-31 13:07:02 +0800 |
| commit | fd4e40ae2ae58c06226cc9eb4c2ae9bdcfb677fd (patch) | |
| tree | cba17079e81ba7ed99f818cfb3c2f30aceacf6f0 | |
| parent | 92d3bc06bad13095df6515111bba45e73f701018 (diff) | |
wordpiece
| -rw-r--r-- | nlp/gensim_demo/text8.zip | bin | 0 -> 31344016 bytes | |||
| -rw-r--r-- | nlp/gensim_demo/w2c.py | 26 | ||||
| -rw-r--r-- | projs/01-fashion-mnist/00_dataset_dataloader.py | 20 |
3 files changed, 46 insertions, 0 deletions
# --- nlp/gensim_demo/w2c.py (new file in this commit) ---
"""Word2vec analogy demo on the text8 corpus.

Loads a pre-trained model and answers the classic analogy
woman + king - man == ?  twice: once via the library helper,
once by hand with unit-normalized vectors.

Updated for the gensim >= 4.0 API: the pre-4.0 calls
(Word2Vec.most_similar, wv.word_vec(use_norm=True), wv.vectors_norm,
wv.index2word) were removed in 4.0 and raise AttributeError.
"""

from gensim.models import word2vec, Word2Vec
from multiprocessing import cpu_count
import numpy as np
from gensim.matutils import unitvec

if __name__ == '__main__':

    # One-off training; uncomment to rebuild the model from the text8 corpus.
    # sentences = word2vec.Text8Corpus('text8')
    # model = Word2Vec(sentences, workers=cpu_count() // 2)
    # model.save('text8.model')
    model = Word2Vec.load('text8.model')

    # woman + king - man == ?
    # gensim 4.x: similarity queries moved from the model onto model.wv.
    print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))

    # get_vector(word, norm=True) replaces the removed word_vec(..., use_norm=True).
    woman_vec = model.wv.get_vector('woman', norm=True)
    king_vec = model.wv.get_vector('king', norm=True)
    man_vec = model.wv.get_vector('man', norm=True)

    # unitvec() rescales to unit length, so the original "/ 3" was a no-op
    # and is dropped here.
    query = unitvec(woman_vec + king_vec - man_vec)
    # get_normed_vectors() replaces the removed vectors_norm attribute.
    sims = np.dot(model.wv.get_normed_vectors(), query)
    # Top-5 cosine similarities, highest first.
    indices = sims.argsort()[::-1][:5]
    for index in indices:
        # index_to_key replaces the removed index2word list.
        print(index, sims[index], model.wv.index_to_key[index])


# --- projs/01-fashion-mnist/00_dataset_dataloader.py (new file in this commit) ---
# FashionMNIST dataset / DataLoader walk-through: download the train and
# test splits, wrap them in DataLoaders, and pull one mini-batch.

from torch.utils.data import Dataset
from torchvision import datasets
from torchvision import transforms as T
import torch

# download=True fetches the data into ./data on first run; ToTensor()
# converts each PIL image to a (1, 28, 28) float tensor in [0, 1].
training_dataset = datasets.FashionMNIST(root='./data', train=True, transform=T.ToTensor(), download=True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=T.ToTensor(), download=True)


print(training_dataset.classes)

# Shuffle only the training split; evaluation order is kept deterministic.
training_loader = torch.utils.data.DataLoader(training_dataset, batch_size=4, shuffle=True, num_workers=0)
validation_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4,
                                                shuffle=False, num_workers=0)

# next(iter(training_loader)) is the one-liner equivalent of the loop below.

# Pull a single batch: data is a [images, labels] pair where images is
# (batch, 1, 28, 28) and labels is (batch,).
for i, data in enumerate(training_loader):
    batch_images, batch_labels = data
    break
