diff --git a/lda2vec/corpus.py b/lda2vec/corpus.py index b93aae0a..01971e77 100644 --- a/lda2vec/corpus.py +++ b/lda2vec/corpus.py @@ -1,6 +1,5 @@ from collections import defaultdict import numpy as np -import difflib import pandas as pd try: @@ -531,8 +530,10 @@ def compact_word_vectors(self, vocab, filename=None, array=None, True """ n_words = len(self.compact_to_loose) - from gensim.models.word2vec import Word2Vec - model = Word2Vec.load_word2vec_format(filename, binary=True) + import gensim + model = gensim.models.KeyedVectors.load_word2vec_format( + filename, binary=True) + n_dim = model.syn0.shape[1] data = np.random.normal(size=(n_words, n_dim)).astype('float32') data -= data.mean() @@ -548,9 +549,12 @@ def compact_word_vectors(self, vocab, filename=None, array=None, choices = np.array(keys, dtype='S') lengths = np.array(lens, dtype='int32') s, f = 0, 0 - rep0 = lambda w: w - rep1 = lambda w: w.replace(' ', '_') - rep2 = lambda w: w.title().replace(' ', '_') + + def rep0(w): return w + + def rep1(w): return w.replace(' ', '_') + + def rep2(w): return w.title().replace(' ', '_') reps = [rep0, rep1, rep2] for compact in np.arange(top): loose = self.compact_to_loose.get(compact, None) @@ -574,7 +578,6 @@ def compact_word_vectors(self, vocab, filename=None, array=None, sel = choices[idx] d = damerau_levenshtein_distance_withNPArray(word, sel) choice = np.array(keys_raw)[idx][np.argmin(d)] - # choice = difflib.get_close_matches(word, choices)[0] vector = model[choice] print compact, word, ' --> ', choice except IndexError: @@ -677,8 +680,9 @@ def compact_to_coocurrence(self, word_compact, indices, window_size=10): for name, index in indices.items(): tokens[name] = index a, b = tokens.copy(), tokens.copy() - mask = lambda x: np.prod([x[k + '_x'] == x[k + '_y'] - for k in indices.keys()], axis=0) + + def mask(x): return np.prod([x[k + '_x'] == x[k + '_y'] + for k in indices.keys()], axis=0) group_keys = ['word_index_x', 'word_index_y', ] group_keys += [k + '_x' for k in indices.keys()] total = []