# Provenance: extracted from git patch fd4e40ae2ae58c06226cc9eb4c2ae9bdcfb677fd
# (From: zhang, Date: Sun, 31 Jul 2022 13:07:02 +0800, Subject: wordpiece; cgit v1.2.3).
# The patch creates two files under nlp/gensim_demo/:
#   text8.zip  - binary, 31344016 bytes (not reproduced here)
#   w2c.py     - the script below
"""Word2Vec analogy demo: woman + king - man == ?

The script answers the analogy twice: once with gensim's own
``most_similar`` and once "by hand" with NumPy, so the two rankings can be
compared.  The manual path uses only ``wv[word]`` and ``wv.vectors`` plus a
small lookup shim, so it does not depend on the Gensim 3.x-only
``word_vec(use_norm=True)``, ``vectors_norm`` and ``index2word`` members.
"""
import os
from multiprocessing import cpu_count

import numpy as np

MODEL_PATH = 'text8.model'
CORPUS_PATH = 'text8'


def unit_vector(vec):
    """Return *vec* scaled to unit L2 length.

    A zero vector is returned unchanged instead of being divided by zero.
    The input is never modified in place.
    """
    length = np.linalg.norm(vec)
    if length > 0:
        return vec / length
    return vec


def normalize_rows(matrix):
    """Return a copy of a 2-D *matrix* whose rows each have unit L2 length.

    All-zero rows stay all-zero.
    """
    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing a zero row by 1 keeps it zero and avoids NaNs.
    lengths[lengths == 0] = 1
    return matrix / lengths


def analogy_query(positive, negative=()):
    """Build the unit-length query vector for an analogy.

    Every input vector is normalized, negative ones are negated, and the
    mean of all terms is normalized again -- i.e. for two positives and one
    negative this is ``unit((a + b - c) / 3)``.

    Raises:
        ValueError: if both *positive* and *negative* are empty.
    """
    terms = [unit_vector(vec) for vec in positive]
    terms.extend(-unit_vector(vec) for vec in negative)
    if not terms:
        raise ValueError('analogy_query needs at least one vector')
    return unit_vector(np.mean(terms, axis=0))


def top_matches(unit_rows, query, topn=5):
    """Return ``(row_index, similarity)`` pairs for the best *topn* rows.

    *unit_rows* is expected to hold unit-length rows and *query* to be a
    unit vector, so the dot product is the cosine similarity.  Pairs are
    ordered from most to least similar.
    """
    sims = np.dot(unit_rows, query)
    best = sims.argsort()[::-1][:topn]
    return [(index, sims[index]) for index in best]


def index_to_word(wv):
    """Return the index -> word list of a gensim ``KeyedVectors`` object.

    Gensim 4 exposes it as ``index_to_key``; Gensim 3 as ``index2word``.
    """
    words = getattr(wv, 'index_to_key', None)
    if words is None:
        words = wv.index2word
    return words


def load_or_train_model(model_path=MODEL_PATH, corpus_path=CORPUS_PATH):
    """Load the saved model, training and saving it first if it is missing.

    Training reads the text8 corpus from *corpus_path* and uses half of the
    available CPUs, but never fewer than one worker.
    """
    # Imported lazily so the NumPy helpers above are usable without gensim.
    from gensim.models import Word2Vec, word2vec

    if os.path.exists(model_path):
        return Word2Vec.load(model_path)

    sentences = word2vec.Text8Corpus(corpus_path)
    # cpu_count() // 2 is 0 on a single-core machine; keep at least 1 worker.
    model = Word2Vec(sentences, workers=max(1, cpu_count() // 2))
    model.save(model_path)
    return model


def main():
    """Print gensim's answer to the analogy, then the manual NumPy ranking."""
    wv = load_or_train_model().wv

    # woman + king - man == ?
    # Called on the KeyedVectors: Word2Vec.most_similar is deprecated in
    # Gensim 3.x and no longer exists in Gensim 4.
    print(wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))

    query = analogy_query([wv['woman'], wv['king']], [wv['man']])
    unit_rows = normalize_rows(wv.vectors)
    words = index_to_word(wv)
    for index, similarity in top_matches(unit_rows, query, topn=5):
        print(index, similarity, words[index])


if __name__ == '__main__':
    main()