posted Nov 24, 2015, 1:26 PM by Chris G [updated Jan 11, 2016, 11:03 AM]
Some basic examples for the use of the Python Word2Vec implementation in Gensim:
#!/usr/bin/env python
from gensim.models import Word2Vec
# A toy corpus: every sentence is a list of string tokens.
sentences = [['first', 'sentence'], ['second', 'sentence']]

# Train word2vec on the two sentences. min_count=1 keeps words that occur
# only once. Word2Vec is the name imported above; the bare `gensim` package
# itself is never imported, so `gensim.models.Word2Vec` would be a NameError.
model = Word2Vec(sentences, min_count=1)

# Or with different options.
# NOTE(review): no word in the toy corpus above occurs 5 times, so
# min_count=5 is only meaningful on a larger corpus -- confirm before
# running this line against the two example sentences.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

# Persist a model to disk with:
fname = '/tmp/mymodel'  # same path that is loaded again below
model.save(fname)

# Advanced users can load a model and continue training it with more sentences:
model = Word2Vec.load(fname)
# Placeholder data; substitute your own iterable of token lists.
more_sentences = [['third', 'sentence'], ['fourth', 'sentence']]
model.train(more_sentences)
A more memory-efficient way: stream the sentences, one line per sentence, from the text files in a directory:
#!/usr/bin/env python
from gensim.models import Word2Vec
class MySentences(object):
    """Iterate over the sentences stored in the files of one directory.

    Every line of every file is treated as one sentence and is yielded as a
    list of whitespace-separated tokens. Lines are read lazily, so the
    corpus never has to be held in memory as a whole. ``__iter__`` starts
    from scratch on each call, so the object can be iterated repeatedly.
    """

    def __init__(self, dirname):
        """Remember the directory to read; no file is opened yet.

        Args:
            dirname: path of the directory to scan. Every entry returned by
                ``os.listdir`` is opened as a text file.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of tokens per line, file by file."""
        # Imported locally because the surrounding snippet only imports gensim.
        import os

        for fname in os.listdir(self.dirname):
            # The context manager closes each file once it has been read,
            # instead of leaving the handle to the garbage collector.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line.split()
sentences = MySentences('/some/directory')  # a memory-friendly iterator
# Use the imported name directly: only `Word2Vec` is imported in this
# snippet, so `gensim.models.Word2Vec` would raise a NameError.
model = Word2Vec(sentences)
Load an existing model, for example one trained on the “text8” corpus (which can be downloaded from http://mattmahoney.net/dc/text8.zip) or, as in the snippet below, the pre-trained GoogleNews vectors.
#!/usr/bin/env python
from gensim.models import Word2Vec
# Alternative ways to obtain a model (uncomment the one you need):
#   a model loaded with Word2Vec.load():
# model = Word2Vec.load('path/to/your/model')
#   vectors stored in the C text format:
# model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)

# Vectors stored in the C binary format.
vectors_file = 'GoogleNews-vectors-negative300.bin'
model = Word2Vec.load_word2vec_format(vectors_file, binary=True)
Some examples of use:
# Query examples against the `model` loaded above. Every result is printed;
# single-argument print(...) behaves the same on Python 2 and Python 3.

# Similarity score between two words.
print(model.similarity('france', 'spain'))

# Analogy-style query using the multiplicative ("cosmul") variant.
print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']))

# Similarity between two sets of words.
print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))
print(model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant']))

# Checks whether single-word sets give the same value as similarity().
print(model.n_similarity(['sushi'], ['restaurant']) == model.similarity('sushi', 'restaurant'))

# Analogy query: top single match for woman + king - man.
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

# Odd-one-out query over a list of words.
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

print(model.similarity('woman', 'man'))

# The original discarded this result; print it like the other examples.
print(model.most_similar(['man']))
|
|