blob: cf3cd92540e50fafa0aec5dda135f553a5714fa1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
from gensim.models import word2vec, Word2Vec
from multiprocessing import cpu_count
import numpy as np
from gensim.matutils import unitvec
if __name__ == '__main__':
# 包含了句子以及分词的处理
# sentences = word2vec.Text8Corpus('text8')
# # sentences = list of list of words
# model = Word2Vec(sentences, workers=cpu_count()//2)
# model.save('text8.model')
model = Word2Vec.load('text8.model')
# model.wv.vectors, model.wv.index2word
# woman + king - man == ?
# woman + king - man == queen
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))
woman_vec = model.wv.word_vec('woman', use_norm=True)
king_vec = model.wv.word_vec('king', use_norm=True)
man_vec = model.wv.word_vec('man', use_norm=True)
query = unitvec((woman_vec+king_vec-man_vec)/3)
sims = np.dot(model.wv.vectors_norm, query)
indices = sims.argsort()[::-1][:5]
for index in indices:
print(index, sims[index], model.wv.index2word[index])
|