From 3a4379723ae8693c1e377ba217f396e7d92f69aa Mon Sep 17 00:00:00 2001 From: amirmohammadkz Date: Wed, 14 Aug 2019 16:28:05 +0430 Subject: [PATCH] add gensim, efficient memory use --- process_data.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/process_data.py b/process_data.py index c5ab027..98d45fc 100644 --- a/process_data.py +++ b/process_data.py @@ -5,7 +5,7 @@ import sys, re import pandas as pd import csv -import getpass +from gensim.models import KeyedVectors def build_data_cv(datafile, cv=10, clean_string=True): @@ -89,25 +89,14 @@ def load_bin_vec(fname, vocab): Loads 300x1 word vecs from Google (Mikolov) word2vec """ word_vecs = {} - with open(fname, "rb") as f: - header = f.readline() - vocab_size, layer1_size = map(int, header.split()) - binary_len = np.dtype(theano.config.floatX).itemsize * layer1_size - for line in xrange(vocab_size): - word = [] - while True: - ch = f.read(1) - if ch == ' ': - word = ''.join(word) - break - if ch != '\n': - word.append(ch) - if word in vocab: - word_vecs[word] = np.fromstring(f.read(binary_len), dtype=theano.config.floatX) - else: - f.read(binary_len) + model = KeyedVectors.load_word2vec_format(fname, binary=True) + for word in vocab: + try: + word_vecs[word] = model.get_vector(word) + except KeyError: + # Word not in the vocabulary + pass return word_vecs - def add_unknown_words(word_vecs, vocab, min_df=1, k=300): """ For words that occur in at least min_df documents, create a separate word vector.