summaryrefslogtreecommitdiff
path: root/nlp/gensim_demo/w2c.py
blob: bf4ccb13e7ef5edcdc545e3a8c3fae38e65c6c7b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

from gensim.models import word2vec, Word2Vec
from multiprocessing import cpu_count
import numpy as np
from gensim.matutils import unitvec

if __name__ == '__main__':
    # Demo: answer the analogy "woman + king - man == ?" with a word2vec
    # model, first through gensim's built-in query and then by hand.

    # Commands that build and save 'text8.model' from the text8 corpus;
    # left commented out so the script only loads the saved model.
    # sentences = word2vec.Text8Corpus('text8')
    # model = Word2Vec(sentences, workers=cpu_count()//2)
    # model.save('text8.model')
    model = Word2Vec.load('text8.model')
    keyed_vectors = model.wv

    # woman + king - man == ?
    # Query through the KeyedVectors object: the same-named method on the
    # model itself is a deprecated forwarder (removed in gensim 4).
    print(keyed_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=2))

    # Make sure the L2-normalised vectors exist before reading them, rather
    # than relying on the most_similar() call above having built them.
    keyed_vectors.init_sims()

    woman_vec = keyed_vectors.word_vec('woman', use_norm=True)
    king_vec = keyed_vectors.word_vec('king', use_norm=True)
    man_vec = keyed_vectors.word_vec('man', use_norm=True)

    # Mean of the signed unit vectors, re-normalised to unit length so the
    # dot product with each normalised word vector is a cosine similarity.
    query = unitvec((woman_vec + king_vec - man_vec) / 3)
    sims = np.dot(keyed_vectors.vectors_norm, query)

    # Five highest-scoring vocabulary indices, best first. No filtering is
    # applied here, so the query words themselves may appear in the output.
    indices = sims.argsort()[::-1][:5]
    for index in indices:
        print(index, sims[index], keyed_vectors.index2word[index])