diff --git a/datahandlers/__init__.py b/datahandlers/__init__.py
deleted file mode 100644
index 8217ded..0000000
--- a/datahandlers/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from base_handler import DataHandlerException, BaseDataHandler
-from imdb_handler import ImdbDataHandler
\ No newline at end of file
diff --git a/nlpdatahandlers/.DS_Store b/nlpdatahandlers/.DS_Store
new file mode 100644
index 0000000..396b9a1
Binary files /dev/null and b/nlpdatahandlers/.DS_Store differ
diff --git a/nlpdatahandlers/__init__.py b/nlpdatahandlers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/datahandlers/base_handler.py b/nlpdatahandlers/base.py
similarity index 56%
rename from datahandlers/base_handler.py
rename to nlpdatahandlers/base.py
index 328fa69..6094207 100644
--- a/datahandlers/base_handler.py
+++ b/nlpdatahandlers/base.py
@@ -1,8 +1,21 @@
+'''
+base.py -- ABC for data handler.
+'''
+
+
+import abc
+
+import numpy as np
+
+from .util.parallel import parallel_run
+
 class DataHandlerException(Exception):
     pass
 
 
 class BaseDataHandler(object):
+    __metaclass__ = abc.ABCMeta
+
     DATA_ALL = 1
     DATA_TRAIN = 2
     DATA_VALIDATION = 4
@@ -11,6 +24,7 @@ class BaseDataHandler(object):
     def __init__(self, source):
         self.source = source
 
+    @abc.abstractmethod
     def get_data(self, type=DATA_ALL):
         """
         Process the data from its source and returns two lists: texts and labels, ready for a classifier to be used
@@ -18,10 +32,16 @@ def get_data(self, type=DATA_ALL):
         raise NotImplementedError()
 
     @staticmethod
-    def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
+    def shuffle_data(train_values, labels):
+        combined_lists = zip(train_values, labels)
+        np.random.shuffle(combined_lists)
+        return zip(*combined_lists)
+
+    @staticmethod
+    def word_level_ix(texts_list, words_per_document, wv_container, prepend=False, needs_tokenizing=False):
         """
-        Receives a list of texts. For each text, it converts the text into sentences and converts the words into
-        indices of a word vector container (Glove, WordToVec) for later use in the embedding of a neural network.
+        Receives a list of texts. For each text, it converts the text into indices of a word
+        vector container (Glove, WordToVec) for later use in the embedding of a neural network.
         Sentences are padded (or reduced) up to words_per_sentence elements.
         Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph
         If prepend = True, padding is added at the beginning
@@ -32,29 +52,41 @@ def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence,
         [ [[5, 24, 3, 223], [123, 25, 0, 0]]. [[34, 25, 0, 0], [0, 0, 0, 0] ]
         using sentences_per_paragraph = 4, words_per_sentence = 4
         """
-        def parallel_run(f, params):
-            '''
-            performs multi-core map of the function `f`
-            over the parameter space spanned by parms.
-            `f` MUST take only one argument.
-            '''
-            from multiprocessing import Pool
+
+
+        if needs_tokenizing:
+            from util.language import tokenize_text
+            texts_list = parallel_run(tokenize_text, texts_list)
+
+        text_with_normalized_documents = BaseDataHandler.__normalize(wv_container.get_indices(texts_list), words_per_document, prepend)
+        return text_with_normalized_documents
+
+
+    @staticmethod
+    def sentence_level_ix(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
+        """
+        Receives a list of texts. For each text, it converts the text into sentences and converts the words into
+        indices of a word vector container (Glove, WordToVec) for later use in the embedding of a neural network.
+
+        Sentences are padded (or reduced) up to words_per_sentence elements.
+        Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph
+        If prepend = True, padding is added at the beginning
-            pool = Pool()
-            ret = pool.map(f, params)
-            pool.close()
-            pool.join()
-            return ret
+        Ex: [[This might be cumbersome. Hopefully not.], [Another text]]
+            to
+        [ [[5, 24, 3, 223], [123, 25, 0, 0]]. [[34, 25, 0, 0], [0, 0, 0, 0] ]
+        using sentences_per_paragraph = 4, words_per_sentence = 4
+        """
 
         from util.language import parse_paragraph
 
         text_sentences = parallel_run(parse_paragraph, texts_list)
-        paragraphs = []
+
-        text_with_normalized_sentences = [BaseDataHandler.__normalize(review, words_per_sentence, prepend)
+        text_with_normalized_sentences = [BaseDataHandler.__normalize(review, size=words_per_sentence, prepend=prepend)
                                           for review in wv_container.get_indices(text_sentences)]
 
         text_padded_paragraphs = BaseDataHandler.__normalize(text_with_normalized_sentences,
-                                                             sentences_per_paragraph, [0] * words_per_sentence)
+                                                             size=sentences_per_paragraph, filler=[0] * words_per_sentence)
 
         return text_padded_paragraphs
diff --git a/nlpdatahandlers/imdb/__init__.py b/nlpdatahandlers/imdb/__init__.py
new file mode 100644
index 0000000..373e8a6
--- /dev/null
+++ b/nlpdatahandlers/imdb/__init__.py
@@ -0,0 +1 @@
+from .imdb_handler import ImdbDataHandler
\ No newline at end of file
diff --git a/datahandlers/imdb_handler.py b/nlpdatahandlers/imdb/imdb_handler.py
similarity index 73%
rename from datahandlers/imdb_handler.py
rename to nlpdatahandlers/imdb/imdb_handler.py
index 34eaffc..2570029 100644
--- a/datahandlers/imdb_handler.py
+++ b/nlpdatahandlers/imdb/imdb_handler.py
@@ -1,4 +1,4 @@
-from base_handler import BaseDataHandler, DataHandlerException
+from ..base import BaseDataHandler, DataHandlerException
 
 import glob
 import os
@@ -9,16 +9,26 @@ class ImdbDataHandler(BaseDataHandler):
     http://ai.stanford.edu/~amaas/data/sentiment/
 
     source defines the folder where the data is downloaded
+
+    Args:
+    -----
+    source: the path to the root aclImdb/ folder for the downloaded data
+
+    Examples:
+    ---------
+
+    >>> imdb = ImdbDataHandler('./aclImdb')
+    >>> train_data, train_labels = imdb.get_data()
     """
 
-    def get_data(self, type=BaseDataHandler.DATA_TRAIN):
+    def get_data(self, type=BaseDataHandler.DATA_TRAIN, shuffle=True):
         """
         Process the data from its source and returns two lists: texts and labels, ready for a classifier to be used
 
         Data is not shuffled
         """
         if type not in (BaseDataHandler.DATA_TRAIN, BaseDataHandler.DATA_TEST):
-            raise DataHandlerException("Only train and test data supported for ImdbHandler")
+            raise DataHandlerException("Only train and test data supported for ImdbDataHandler")
         else:
             which_data = 'train' if type == BaseDataHandler.DATA_TRAIN else 'test'
 
@@ -34,5 +44,7 @@ def get_data(self, type=BaseDataHandler.DATA_TRAIN):
             data.append((open(f, 'rb').read().lower()).replace('<br />', '\n'))
             labels.append(0)
 
+        if shuffle:
+            return self.shuffle_data(data, labels)
         return (data, labels)
diff --git a/nlpdatahandlers/util/__init__.py b/nlpdatahandlers/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nlpdatahandlers/util/language.py b/nlpdatahandlers/util/language.py
new file mode 100644
index 0000000..3373543
--- /dev/null
+++ b/nlpdatahandlers/util/language.py
@@ -0,0 +1,71 @@
+try:
+    from spacy.en import English
+except ImportError:
+    raise ImportError('[!] You need to install spaCy! Visit spacy.io/#install')
+
+# Spacy.en provides a faster tokenizer than nltk
+nlp = English()
+
+def parse_paragraph(txt):
+    """
+    Takes a text and returns a list of lists of tokens, where each sublist is a sentence
+    """
+    sentences = nlp(u'' + txt.decode('ascii', errors='ignore')).sents
+    return [[t.text for t in s] for s in sentences]
+
+def tokenize_text(text):
+    """
+    Gets tokens from a text in English
+    """
+    if not isinstance(text, unicode):
+        text = unicode(text)
+
+    tokens = [token.lower_ for token in nlp(text)]
+
+    return tokens
+
+def _calculate_languages_ratios(text):
+    """
+    Score the given text against several languages and return a dictionary
+    that looks like {'french': 2, 'spanish': 4, 'english': 0}
+
+    @param text: Text whose language is to be detected
+    @type text: str
+
+    @return: Dictionary with languages and unique stopwords seen in the analyzed text
+    @rtype: dict
+    """
+    from nltk.corpus import stopwords
+
+
+    languages_ratios = {}
+    tokens = tokenize_text(text)
+
+    # Compute, per language included in nltk, the number of unique stopwords appearing in the analyzed text
+    for language in stopwords.fileids():
+        stopwords_set = set(stopwords.words(language))
+        words_set = set(tokens)
+        common_elements = words_set.intersection(stopwords_set)
+
+        languages_ratios[language] = len(common_elements) # language "score"
+
+    return languages_ratios
+
+def detect_language(text):
+    """
+    Score the given text against several languages and
+    return the highest-scoring one.
+
+    It uses a stopwords-based approach, counting how many unique stopwords
+    are seen in the analyzed text.
+
+    @param text: Text whose language is to be detected
+    @type text: str
+
+    @return: Highest-scoring language guess
+    @rtype: str
+    """
+    ratios = _calculate_languages_ratios(text)
+    most_rated_language = max(ratios, key=ratios.get)
+
+    return most_rated_language
diff --git a/nlpdatahandlers/util/misc.py b/nlpdatahandlers/util/misc.py
new file mode 100644
index 0000000..5b2f35c
--- /dev/null
+++ b/nlpdatahandlers/util/misc.py
@@ -0,0 +1,35 @@
+from .language import tokenize_text
+
+def normalize_sos(sq, sz=30, filler=0, prepend=True):
+    '''
+    Take a list of lists and ensure that they are all of length `sz`
+
+    Args:
+    -----
+        sq: a non-generator iterable of lists
+
+        sz: integer, the size that each sublist should be normalized to
+
+        filler: obj -- what should be added to fill out the size?
+
+        prepend: should `filler` be added to the front or the back of the list?
+
+    '''
+    if not prepend:
+        def _normalize(e, sz):
+            return e[:sz] if len(e) >= sz else e + [filler] * (sz - len(e))
+        return [_normalize(e, sz) for e in sq]
+    else:
+        def _normalize(e, sz):
+            return e[-sz:] if len(e) >= sz else [filler] * (sz - len(e)) + e
+        return [_normalize(e, sz) for e in sq]
+
+
+def to_glove_vectors(text, glovebox):
+    tokens = tokenize_text(text)
+
+    wvs = []
+    for token in tokens:
+        wvs.append(glovebox[token])
+
+    return wvs
\ No newline at end of file
diff --git a/nlpdatahandlers/util/parallel.py b/nlpdatahandlers/util/parallel.py
new file mode 100644
index 0000000..971720c
--- /dev/null
+++ b/nlpdatahandlers/util/parallel.py
@@ -0,0 +1,18 @@
+'''
+parallel.py -- parallel-processing utilities for data loading
+'''
+
+def parallel_run(f, params):
+    '''
+    performs a multi-core map of the function `f`
+    over the parameter space spanned by params.
+
+    `f` MUST take only one argument.
+    '''
+    from multiprocessing import Pool
+
+    pool = Pool()
+    ret = pool.map(f, params)
+    pool.close()
+    pool.join()
+    return ret
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0fa54b6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup
+from setuptools import find_packages
+
+setup(name='NLPDataHandlers',
+      version='0.0.1',
+      description='Library for loading datasets for deep learning.',
+      author='Luke de Oliveira, Alfredo Lainez',
+      author_email='lukedeo@stanford.edu, alainez@stanford.edu',
+      url='https://github.com/textclf/data-handler',
+      # install_requires=['pandas'],
+      packages=find_packages())
\ No newline at end of file
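
Usage sketch (reviewer note, not part of the patch): the snippet below exercises the reorganized API under Python 2, matching the package code. The word-vector container is a stand-in -- the only assumption, taken from BaseDataHandler.word_level_ix, is that it exposes a get_indices() method for the word-level path; the WordIndexStub class and the 300-word cut-off are illustrative placeholders, not part of the library.

from nlpdatahandlers.imdb import ImdbDataHandler


class WordIndexStub(object):
    """Stand-in for a GloVe/word2vec container exposing get_indices() (word-level path only)."""
    def __init__(self):
        self.vocab = {}

    def get_indices(self, token_lists):
        # Assign incremental integer ids per token, leaving 0 free for padding.
        return [[self.vocab.setdefault(tok, len(self.vocab) + 1) for tok in doc]
                for doc in token_lists]


imdb = ImdbDataHandler('./aclImdb')          # root of the downloaded aclImdb dataset
texts, labels = imdb.get_data(shuffle=True)  # shuffled parallel sequences of reviews and labels

# Raw reviews -> fixed-length index sequences, ready for an embedding layer.
train_ix = ImdbDataHandler.word_level_ix(texts,
                                         words_per_document=300,
                                         wv_container=WordIndexStub(),
                                         needs_tokenizing=True)

This mirrors the split the patch introduces: get_data() owns reading and shuffling, while the static *_ix helpers turn raw text into padded index lists for a network.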