-rw-r--r--  nlp/gensim_demo/text8.zip                         bin  0 -> 31344016 bytes
-rw-r--r--  nlp/gensim_demo/w2c.py                            26
-rw-r--r--  projs/01-fashion-mnist/00_dataset_dataloader.py   20
3 files changed, 46 insertions, 0 deletions
diff --git a/nlp/gensim_demo/text8.zip b/nlp/gensim_demo/text8.zip
new file mode 100644
index 0000000..436e05b
--- /dev/null
+++ b/nlp/gensim_demo/text8.zip
Binary files differ
diff --git a/nlp/gensim_demo/w2c.py b/nlp/gensim_demo/w2c.py
new file mode 100644
index 0000000..bf4ccb1
--- /dev/null
+++ b/nlp/gensim_demo/w2c.py
@@ -0,0 +1,26 @@
+
+from gensim.models import word2vec, Word2Vec
+from multiprocessing import cpu_count
+import numpy as np
+from gensim.matutils import unitvec
+
+if __name__ == '__main__':
+    # one-time training on the text8 corpus (uncomment to rebuild the saved model)
+    # sentences = word2vec.Text8Corpus('text8')
+    # model = Word2Vec(sentences, workers=cpu_count()//2)
+    # model.save('text8.model')
+    model = Word2Vec.load('text8.model')
+    # woman + king - man == ?
+    print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))
+
+    # reproduce the analogy by hand with unit-normalized vectors (gensim 3.x KeyedVectors API)
+    woman_vec = model.wv.word_vec('woman', use_norm=True)
+    king_vec = model.wv.word_vec('king', use_norm=True)
+    man_vec = model.wv.word_vec('man', use_norm=True)
+    query = unitvec((woman_vec + king_vec - man_vec) / 3)  # unitvec re-normalizes, so the /3 does not change the result
+    sims = np.dot(model.wv.vectors_norm, query)  # cosine similarity of the query against every word vector
+    indices = sims.argsort()[::-1][:5]
+    for index in indices:
+        print(index, sims[index], model.wv.index2word[index])
+
+
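Note: w2c.py above relies on the gensim 3.x KeyedVectors API (word_vec(..., use_norm=True), vectors_norm, index2word), parts of which were renamed or removed in gensim 4.0. A minimal sketch of the same manual analogy query against the 4.x names, assuming the same saved text8.model, might look like this:

import numpy as np
from gensim.models import Word2Vec
from gensim.matutils import unitvec

model = Word2Vec.load('text8.model')

# gensim >= 4.0: get_vector(..., norm=True) plays the role of word_vec(..., use_norm=True)
woman = model.wv.get_vector('woman', norm=True)
king = model.wv.get_vector('king', norm=True)
man = model.wv.get_vector('man', norm=True)
query = unitvec(woman + king - man)

# get_normed_vectors() replaces vectors_norm; index_to_key replaces index2word
sims = np.dot(model.wv.get_normed_vectors(), query)
for index in sims.argsort()[::-1][:5]:
    print(index, sims[index], model.wv.index_to_key[index])

The model.wv.most_similar(...) call itself works the same way in both major versions.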
diff --git a/projs/01-fashion-mnist/00_dataset_dataloader.py b/projs/01-fashion-mnist/00_dataset_dataloader.py
new file mode 100644
index 0000000..e967821
--- /dev/null
+++ b/projs/01-fashion-mnist/00_dataset_dataloader.py
@@ -0,0 +1,20 @@
+
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision import transforms as T
+import torch
+
+# download Fashion-MNIST and convert the PIL images to [0, 1] float tensors
+training_dataset = datasets.FashionMNIST(root='./data', train=True, transform=T.ToTensor(), download=True)
+test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=T.ToTensor(), download=True)
+
+# the ten clothing class names
+print(training_dataset.classes)
+
+training_loader = DataLoader(training_dataset, batch_size=4, shuffle=True, num_workers=0)
+validation_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0)
+# next(iter(training_loader))
+# pull one mini-batch of images and labels from the training loader, then stop
+for i, data in enumerate(training_loader):
+    batch_images, batch_labels = data
+    break
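As a quick sanity check on the loaders above, a single batch can be pulled and inspected directly; the snippet below is only a sketch reusing training_loader and training_dataset from the script:

# assumes training_loader and training_dataset from 00_dataset_dataloader.py
batch_images, batch_labels = next(iter(training_loader))

# with batch_size=4 and ToTensor(): images are [4, 1, 28, 28] floats in [0, 1], labels are [4] int64
print(batch_images.shape, batch_labels.shape)
print([training_dataset.classes[label.item()] for label in batch_labels])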