wordpiece

author: zhang <zch921005@126.com> 2022-07-31 13:07:02 +0800
committer: zhang <zch921005@126.com> 2022-07-31 13:07:02 +0800
commit: fd4e40ae2ae58c06226cc9eb4c2ae9bdcfb677fd (patch)
tree: cba17079e81ba7ed99f818cfb3c2f30aceacf6f0 /nlp
parent: 92d3bc06bad13095df6515111bba45e73f701018 (diff)
2 files changed, 26 insertions, 0 deletions
diff --git a/nlp/gensim_demo/text8.zip b/nlp/gensim_demo/text8.zip
new file mode 100644
index 0000000..436e05b
--- /dev/null
+++ b/nlp/gensim_demo/text8.zip
diff --git a/nlp/gensim_demo/w2c.py b/nlp/gensim_demo/w2c.py
new file mode 100644
index 0000000..bf4ccb1
--- /dev/null
+++ b/nlp/gensim_demo/w2c.py
@@ -0,0 +1,26 @@
+
+from gensim.models import word2vec, Word2Vec
+from multiprocessing import cpu_count
+import numpy as np
+from gensim.matutils import unitvec
+
+if __name__ == '__main__':
+    # To retrain from the raw corpus instead of loading the saved model:
+    #   Word2Vec(word2vec.Text8Corpus('text8'), workers=cpu_count()//2).save('text8.model')
+    wv = Word2Vec.load('text8.model').wv
+    # Fill wv.vectors_norm explicitly; the manual query below must not depend on
+    # most_similar() having run first.
+    wv.init_sims()
+
+    # woman + king - man == ?  (Word2Vec.most_similar is deprecated; ask the KeyedVectors)
+    print(wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))
+
+    # Same query by hand: average the signed unit vectors, re-normalise, then take the
+    # dot product with every normalised vocabulary vector.
+    woman_vec, king_vec, man_vec = (wv.word_vec(w, use_norm=True) for w in ('woman', 'king', 'man'))
+    query = unitvec((woman_vec + king_vec - man_vec) / 3)
+    sims = np.dot(wv.vectors_norm, query)
+    for index in sims.argsort()[::-1][:5]:  # five highest similarities, best first
+        print(index, sims[index], wv.index2word[index])
+
+
author	zhang <zch921005@126.com>	2022-07-31 13:07:02 +0800
committer	zhang <zch921005@126.com>	2022-07-31 13:07:02 +0800
commit	fd4e40ae2ae58c06226cc9eb4c2ae9bdcfb677fd (patch)
tree	cba17079e81ba7ed99f818cfb3c2f30aceacf6f0 /nlp
parent	92d3bc06bad13095df6515111bba45e73f701018 (diff)