diff options
| author | zhang <zch921005@126.com> | 2022-07-31 13:07:02 +0800 |
|---|---|---|
| committer | zhang <zch921005@126.com> | 2022-07-31 13:07:02 +0800 |
| commit | fd4e40ae2ae58c06226cc9eb4c2ae9bdcfb677fd (patch) | |
| tree | cba17079e81ba7ed99f818cfb3c2f30aceacf6f0 /nlp | |
| parent | 92d3bc06bad13095df6515111bba45e73f701018 (diff) | |
wordpiece
Diffstat (limited to 'nlp')
| -rw-r--r-- | nlp/gensim_demo/text8.zip | bin | 0 -> 31344016 bytes | |||
| -rw-r--r-- | nlp/gensim_demo/w2c.py | 26 |
2 files changed, 26 insertions, 0 deletions
```diff
diff --git a/nlp/gensim_demo/text8.zip b/nlp/gensim_demo/text8.zip
Binary files differ
new file mode 100644
index 0000000..436e05b
--- /dev/null
+++ b/nlp/gensim_demo/text8.zip
diff --git a/nlp/gensim_demo/w2c.py b/nlp/gensim_demo/w2c.py
new file mode 100644
index 0000000..bf4ccb1
--- /dev/null
+++ b/nlp/gensim_demo/w2c.py
@@ -0,0 +1,26 @@
+
+from gensim.models import word2vec, Word2Vec
+from multiprocessing import cpu_count
+import numpy as np
+from gensim.matutils import unitvec
+
+if __name__ == '__main__':
+
+    # sentences = word2vec.Text8Corpus('text8')
+    # model = Word2Vec(sentences, workers=cpu_count()//2)
+    # model.save('text8.model')
+    model = Word2Vec.load('text8.model')
+    # woman + king - man == ?
+    print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))
+
+    woman_vec = model.wv.word_vec('woman', use_norm=True)
+    king_vec = model.wv.word_vec('king', use_norm=True)
+    man_vec = model.wv.word_vec('man', use_norm=True)
+
+    query = unitvec((woman_vec+king_vec-man_vec)/3)
+    sims = np.dot(model.wv.vectors_norm, query)
+    indices = sims.argsort()[::-1][:5]
+    for index in indices:
+        print(index, sims[index], model.wv.index2word[index])
+
+
```
