nlp/gensim_demo/w2c.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


from gensim.models import word2vec, Word2Vec
from multiprocessing import cpu_count
import numpy as np
from gensim.matutils import unitvec

if __name__ == '__main__':
    # Demo: answer the analogy "woman + king - man = ?" with a word2vec
    # model, first via gensim's most_similar() and then by hand using the
    # normalised embedding matrix.

    # One-off training step, kept for reference. Text8Corpus handles the
    # sentence splitting and tokenisation of the raw corpus.
    # sentences = word2vec.Text8Corpus('text8')
    # # sentences = list of list of words
    # model = Word2Vec(sentences, workers=cpu_count()//2)
    # model.save('text8.model')

    model = Word2Vec.load('text8.model')
    keyed_vectors = model.wv

    # model.wv.vectors, model.wv.index2word
    # woman + king - man == ?
    # woman + king - man == queen
    # Query through model.wv: the same-named method on the Word2Vec object
    # is only a deprecated forwarding wrapper.
    print(keyed_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))

    # Build the L2-normalised vectors explicitly so the use_norm=True lookups
    # and vectors_norm below do not silently depend on most_similar() having
    # been called first.
    keyed_vectors.init_sims()

    woman_vec = keyed_vectors.word_vec('woman', use_norm=True)
    king_vec = keyed_vectors.word_vec('king', use_norm=True)
    man_vec = keyed_vectors.word_vec('man', use_norm=True)

    # Mean of the signed unit vectors, rescaled to unit length. The division
    # by 3 does not change the direction; it is kept so the arithmetic is
    # spelled out as an average.
    query = unitvec((woman_vec + king_vec - man_vec) / 3)

    # Rows of vectors_norm and the query are unit length, so each dot
    # product is a cosine similarity.
    sims = np.dot(keyed_vectors.vectors_norm, query)

    # Five highest-scoring vocabulary indices, best first. Nothing is
    # filtered out here, so the query words themselves may be listed.
    indices = sims.argsort()[::-1][:5]
    for index in indices:
        print(index, sims[index], keyed_vectors.index2word[index])