summary refs log tree commit diff
path: root/nlp/gensim_demo
diff options
context:
space:
mode:
Diffstat (limited to 'nlp/gensim_demo')
-rw-r--r-- nlp/gensim_demo/text8.zip bin 0 -> 31344016 bytes
-rw-r--r-- nlp/gensim_demo/w2c.py 26
2 files changed, 26 insertions, 0 deletions
diff --git a/nlp/gensim_demo/text8.zip b/nlp/gensim_demo/text8.zip
new file mode 100644
index 0000000..436e05b
--- /dev/null
+++ b/nlp/gensim_demo/text8.zip
Binary files differ
diff --git a/nlp/gensim_demo/w2c.py b/nlp/gensim_demo/w2c.py
new file mode 100644
index 0000000..bf4ccb1
--- /dev/null
+++ b/nlp/gensim_demo/w2c.py
@@ -0,0 +1,26 @@
+
+from gensim.models import word2vec, Word2Vec
+from multiprocessing import cpu_count
+import numpy as np
+from gensim.matutils import unitvec
+
+if __name__ == '__main__':
+    # Demo: answer the analogy "woman + king - man == ?" twice -- first with
+    # gensim's built-in query, then by hand with NumPy -- and print both
+    # result lists so they can be compared.
+    #
+    # One-time training step that produced 'text8.model' (kept for reference):
+    # sentences = word2vec.Text8Corpus('text8')
+    # model = Word2Vec(sentences, workers=cpu_count()//2)
+    # model.save('text8.model')
+    model = Word2Vec.load('text8.model')
+    vectors = model.wv
+
+    # Build the normalised matrix explicitly. `vectors_norm` and
+    # `word_vec(..., use_norm=True)` below both read it; in gensim 3.x it was
+    # otherwise only filled in as a side effect of the most_similar() call,
+    # so removing or moving that print would have broken the manual part.
+    vectors.init_sims()
+
+    # woman + king - man == ?
+    # Query through `wv`: Word2Vec.most_similar is a deprecated forwarder.
+    print(vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))
+
+    woman_vec = vectors.word_vec('woman', use_norm=True)
+    king_vec = vectors.word_vec('king', use_norm=True)
+    man_vec = vectors.word_vec('man', use_norm=True)
+
+    # Mean of the signed vectors, rescaled to unit length by unitvec().
+    query = unitvec((woman_vec + king_vec - man_vec) / 3)
+    # One similarity score per vocabulary row.
+    sims = np.dot(vectors.vectors_norm, query)
+    # Five highest-scoring rows, best first. Nothing here filters out the
+    # query words themselves, so they may appear in the output.
+    indices = sims.argsort()[::-1][:5]
+    for index in indices:
+        print(index, sims[index], vectors.index2word[index])
+
+