From bb49fb812a646ff7e8db21525ffdc99fb1e01542 Mon Sep 17 00:00:00 2001 From: Albert Tsai Date: Tue, 12 Jul 2016 13:25:48 -0700 Subject: [PATCH] Made off-by-one adjustments for specials tokens --- lda2vec/corpus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lda2vec/corpus.py b/lda2vec/corpus.py index b93aae0a..c502d5f7 100644 --- a/lda2vec/corpus.py +++ b/lda2vec/corpus.py @@ -96,6 +96,7 @@ def _loose_keys_ordered(self): """ Get the loose keys in order of decreasing frequency""" loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1], reverse=True) + loose_counts = [i for i in loose_counts if i[0] not in self.specials.values()] keys = np.array(loose_counts)[:, 0] counts = np.array(loose_counts)[:, 1] order = np.argsort(counts)[::-1].astype('int32') @@ -415,7 +416,7 @@ def compact_to_flat(self, word_compact, *components): self._check_finalized() n_docs = word_compact.shape[0] max_length = word_compact.shape[1] - idx = word_compact > self.n_specials + idx = word_compact >= self.n_specials components_raveled = [] msg = "Length of each component must much `word_compact` size" for component in components: @@ -552,7 +553,7 @@ def compact_word_vectors(self, vocab, filename=None, array=None, rep1 = lambda w: w.replace(' ', '_') rep2 = lambda w: w.title().replace(' ', '_') reps = [rep0, rep1, rep2] - for compact in np.arange(top): + for compact in np.arange(min(top, n_words)): loose = self.compact_to_loose.get(compact, None) if loose is None: continue