diff --git a/datahandlers/__init__.py b/datahandlers/__init__.py
deleted file mode 100644
index 8217ded..0000000
--- a/datahandlers/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from base_handler import DataHandlerException, BaseDataHandler
-from imdb_handler import ImdbDataHandler
\ No newline at end of file
diff --git a/nlpdatahandlers/__init__.py b/nlpdatahandlers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/datahandlers/base_handler.py b/nlpdatahandlers/base.py
similarity index 56%
rename from datahandlers/base_handler.py
rename to nlpdatahandlers/base.py
index 328fa69..6094207 100644
--- a/datahandlers/base_handler.py
+++ b/nlpdatahandlers/base.py
@@ -1,8 +1,21 @@
+'''
+base.py -- ABC for data handler.
+'''
+
+
+import abc
+
+import numpy as np
+
+from .util.parallel import parallel_run
+
class DataHandlerException(Exception):
pass
class BaseDataHandler(object):
+ __metaclass__ = abc.ABCMeta
+
DATA_ALL = 1
DATA_TRAIN = 2
DATA_VALIDATION = 4
@@ -11,6 +24,7 @@ class BaseDataHandler(object):
def __init__(self, source):
self.source = source
+ @abc.abstractmethod
def get_data(self, type=DATA_ALL):
"""
Process the data from its source and returns two lists: texts and labels, ready for a classifier to be used
@@ -18,10 +32,16 @@ def get_data(self, type=DATA_ALL):
raise NotImplementedError()
@staticmethod
- def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
+ def shuffle_data(train_values, labels):
+ combined_lists = zip(train_values, labels)
+ np.random.shuffle(combined_lists)
+ return zip(*combined_lists)
+
+ @staticmethod
+ def word_level_ix(texts_list, words_per_document, wv_container, prepend=False, needs_tokenizing=False):
"""
- Receives a list of texts. For each text, it converts the text into sentences and converts the words into
- indices of a word vector container (Glove, WordToVec) for later use in the embedding of a neural network.
+ Receives a list of texts. For each text, it converts the text into indices of a word
+ vector container (Glove, WordToVec) for later use in the embedding of a neural network.
-        Sentences are padded (or reduced) up to words_per_sentence elements.
-        Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph
+        Each document is padded (or reduced) up to words_per_document indices.
@@ -32,29 +52,41 @@ def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence,
[ [[5, 24, 3, 223], [123, 25, 0, 0]]. [[34, 25, 0, 0], [0, 0, 0, 0] ]
using sentences_per_paragraph = 4, words_per_sentence = 4
"""
- def parallel_run(f, params):
- '''
- performs multi-core map of the function `f`
- over the parameter space spanned by parms.
- `f` MUST take only one argument.
- '''
- from multiprocessing import Pool
+
+
+ if needs_tokenizing:
+            from .util.language import tokenize_text
+ texts_list = parallel_run(tokenize_text, texts_list)
+
+        text_with_normalized_documents = BaseDataHandler.__normalize(wv_container.get_indices(texts_list),
+                                                                     size=words_per_document, prepend=prepend)
+        return text_with_normalized_documents
+
+
+ @staticmethod
+ def sentence_level_ix(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
+ """
+ Receives a list of texts. For each text, it converts the text into sentences and converts the words into
+ indices of a word vector container (Glove, WordToVec) for later use in the embedding of a neural network.
+
+ Sentences are padded (or reduced) up to words_per_sentence elements.
+ Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph
+ If prepend = True, padding is added at the beginning
- pool = Pool()
- ret = pool.map(f, params)
- pool.close()
- pool.join()
- return ret
+ Ex: [[This might be cumbersome. Hopefully not.], [Another text]]
+ to
+    [ [[5, 24, 3, 223], [123, 25, 0, 0]], [[34, 25, 0, 0], [0, 0, 0, 0]] ]
+ using sentences_per_paragraph = 4, words_per_sentence = 4
+ """
from util.language import parse_paragraph
text_sentences = parallel_run(parse_paragraph, texts_list)
- paragraphs = []
- text_with_normalized_sentences = [BaseDataHandler.__normalize(review, words_per_sentence, prepend)
+
+ text_with_normalized_sentences = [BaseDataHandler.__normalize(review, size=words_per_sentence, prepend=prepend)
for review in wv_container.get_indices(text_sentences)]
text_padded_paragraphs = BaseDataHandler.__normalize(text_with_normalized_sentences,
- sentences_per_paragraph, [0] * words_per_sentence)
+ size=sentences_per_paragraph, filler=[0] * words_per_sentence)
return text_padded_paragraphs
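
A hedged usage sketch of the two new static helpers follows. `ToyIndexer` is hypothetical and only mimics the `get_indices` interface that `word_level_ix` / `sentence_level_ix` expect from a GloVe/word2vec container; it also assumes the private `__normalize` helper carried over from the original module pads with 0, like `util.misc.normalize_sos`. The package itself targets Python 2.

```python
# Hypothetical usage sketch -- ToyIndexer is not part of this diff.
from nlpdatahandlers.base import BaseDataHandler

class ToyIndexer(object):
    def __init__(self, vocab):
        # Reserve index 0 for padding, matching the filler used when normalizing
        self.vocab = dict((word, i + 1) for i, word in enumerate(vocab))

    def get_indices(self, nested_tokens):
        # Replace each token with its integer id, preserving the nesting
        def convert(node):
            if isinstance(node, list):
                return [convert(child) for child in node]
            return self.vocab.get(node, 0)
        return convert(nested_tokens)

indexer = ToyIndexer(["the", "movie", "was", "great", "awful"])

# Already-tokenized documents (needs_tokenizing=False avoids the spaCy dependency here)
docs = [["the", "movie", "was", "great"], ["awful", "movie"]]
labels = [1, 0]

# Pad/clip every document to 6 word indices for an embedding layer
word_ix = BaseDataHandler.word_level_ix(docs, words_per_document=6, wv_container=indexer)

# Shuffle features and labels together before training
docs, labels = BaseDataHandler.shuffle_data(docs, labels)
```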
diff --git a/nlpdatahandlers/imdb/__init__.py b/nlpdatahandlers/imdb/__init__.py
new file mode 100644
index 0000000..373e8a6
--- /dev/null
+++ b/nlpdatahandlers/imdb/__init__.py
@@ -0,0 +1 @@
+from .imdb_handler import ImdbDataHandler
\ No newline at end of file
diff --git a/datahandlers/imdb_handler.py b/nlpdatahandlers/imdb/imdb_handler.py
similarity index 73%
rename from datahandlers/imdb_handler.py
rename to nlpdatahandlers/imdb/imdb_handler.py
index 34eaffc..2570029 100644
--- a/datahandlers/imdb_handler.py
+++ b/nlpdatahandlers/imdb/imdb_handler.py
@@ -1,4 +1,4 @@
-from base_handler import BaseDataHandler, DataHandlerException
+from ..base import BaseDataHandler, DataHandlerException
import glob
import os
@@ -9,16 +9,26 @@ class ImdbDataHandler(BaseDataHandler):
http://ai.stanford.edu/~amaas/data/sentiment/
source defines the folder where the data is downloaded
+
+ Args:
+ -----
+ source: the path to the root aclImdb/ folder for the downloaded data
+
+ Examples:
+ ---------
+
+    >>> imdb = ImdbDataHandler('./aclImdb')
+ >>> train_data, train_labels = imdb.get_data()
"""
- def get_data(self, type=BaseDataHandler.DATA_TRAIN):
+ def get_data(self, type=BaseDataHandler.DATA_TRAIN, shuffle=True):
"""
Process the data from its source and returns two lists: texts and labels, ready for a classifier to be used
-        Data is not shuffled
+        Data is shuffled by default; pass shuffle=False to keep the on-disk order
"""
if type not in (BaseDataHandler.DATA_TRAIN, BaseDataHandler.DATA_TEST):
- raise DataHandlerException("Only train and test data supported for ImdbHandler")
+ raise DataHandlerException("Only train and test data supported for ImdbDataHandler")
else:
which_data = 'train' if type == BaseDataHandler.DATA_TRAIN else 'test'
@@ -34,5 +44,7 @@ def get_data(self, type=BaseDataHandler.DATA_TRAIN):
             data.append((open(f, 'rb').read().lower()).replace('<br />', '\n'))
labels.append(0)
+ if shuffle:
+ return self.shuffle_data(data, labels)
return (data, labels)
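
For context, a hedged end-to-end sketch of how the renamed handler might feed a classifier. The `./aclImdb` path and the `words_per_document` value are assumptions, and `wv` stands in for any GloVe/word2vec wrapper exposing `get_indices()`, as assumed by `BaseDataHandler.word_level_ix` above.

```python
# Hypothetical pipeline sketch (Python 2, matching the package); paths and sizes are assumptions.
from nlpdatahandlers.imdb import ImdbDataHandler
from nlpdatahandlers.base import BaseDataHandler

imdb = ImdbDataHandler(source='./aclImdb')   # root of the downloaded aclImdb dataset
train_texts, train_labels = imdb.get_data(ImdbDataHandler.DATA_TRAIN)
test_texts, test_labels = imdb.get_data(ImdbDataHandler.DATA_TEST, shuffle=False)

# Convert raw reviews to padded word-index matrices for an embedding layer.
# `wv` would be a GloVe/word2vec container; uncomment once one is available.
# train_ix = BaseDataHandler.word_level_ix(train_texts, words_per_document=300,
#                                          wv_container=wv, needs_tokenizing=True)
```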
diff --git a/nlpdatahandlers/util/__init__.py b/nlpdatahandlers/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nlpdatahandlers/util/language.py b/nlpdatahandlers/util/language.py
new file mode 100644
index 0000000..3373543
--- /dev/null
+++ b/nlpdatahandlers/util/language.py
@@ -0,0 +1,71 @@
+try:
+ from spacy.en import English
+except ImportError:
+ raise ImportError('[!] You need to install spaCy! Visit spacy.io/#install')
+
+# Spacy.en provides a faster tokenizer than nltk
+nlp = English()
+
+def parse_paragraph(txt):
+ """
+ Takes a text and returns a list of lists of tokens, where each sublist is a sentence
+ """
+ sentences = nlp(u'' + txt.decode('ascii', errors='ignore')).sents
+ return [[t.text for t in s] for s in sentences]
+
+def tokenize_text(text):
+ """
+ Gets tokens from a text in English
+ """
+ if not isinstance(text, unicode):
+ text = unicode(text)
+
+ tokens = [token.lower_ for token in nlp(text)]
+
+ return tokens
+
+def _calculate_languages_ratios(text):
+ """
+    Score how likely the given text is to be written in each of several languages and
+    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
+
+    @param text: Text whose language is to be detected
+    @type text: str
+
+    @return: Dictionary mapping each language to the number of its unique stopwords
+             seen in the analyzed text
+    @rtype: dict
+ """
+ from nltk.corpus import stopwords
+
+
+ languages_ratios = {}
+ tokens = tokenize_text(text)
+
+    # For each language in NLTK's stopword corpus, count the unique stopwords appearing in the analyzed text
+ for language in stopwords.fileids():
+ stopwords_set = set(stopwords.words(language))
+ words_set = set(tokens)
+ common_elements = words_set.intersection(stopwords_set)
+
+ languages_ratios[language] = len(common_elements) # language "score"
+
+ return languages_ratios
+
+def detect_language(text):
+ """
+    Guess the language of the given text from several candidates and
+    return the highest-scoring one.
+
+    It uses a stopwords-based approach, counting how many unique stopwords
+    of each language are seen in the analyzed text.
+
+    @param text: Text whose language is to be detected
+    @type text: str
+
+    @return: Highest-scoring language guess
+    @rtype: str
+ """
+ ratios = _calculate_languages_ratios(text)
+ most_rated_language = max(ratios, key=ratios.get)
+
+ return most_rated_language
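
A small orientation sketch for the three public helpers in this module. It assumes spaCy's English model and NLTK's stopwords corpus are installed (e.g. via `nltk.download('stopwords')`), and Python 2, which the `unicode` handling above implies.

```python
# Illustrative only; requires spacy.en and the NLTK stopwords corpus.
from nlpdatahandlers.util.language import parse_paragraph, tokenize_text, detect_language

text = "This movie was surprisingly good. I would watch it again."

sentences = parse_paragraph(text)   # [['This', 'movie', ...], ['I', 'would', ...]]
tokens = tokenize_text(text)        # ['this', 'movie', 'was', ...]
language = detect_language(text)    # 'english'
```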
diff --git a/nlpdatahandlers/util/misc.py b/nlpdatahandlers/util/misc.py
new file mode 100644
index 0000000..5b2f35c
--- /dev/null
+++ b/nlpdatahandlers/util/misc.py
@@ -0,0 +1,35 @@
+from .language import tokenize_text
+
+def normalize_sos(sq, sz=30, filler=0, prepend=True):
+ '''
+ Take a list of lists and ensure that they are all of length `sz`
+
+ Args:
+ -----
+    sq: a non-generator iterable of lists
+
+ sz: integer, the size that each sublist should be normalized to
+
+ filler: obj -- what should be added to fill out the size?
+
+    prepend: if True, pad (and trim) at the front of each list; otherwise at the back
+
+ '''
+ if not prepend:
+ def _normalize(e, sz):
+ return e[:sz] if len(e) >= sz else e + [filler] * (sz - len(e))
+ return [_normalize(e, sz) for e in sq]
+ else:
+ def _normalize(e, sz):
+ return e[-sz:] if len(e) >= sz else [filler] * (sz - len(e)) + e
+ return [_normalize(e, sz) for e in sq]
+
+
+def to_glove_vectors(text, glovebox):
+    '''Tokenize `text` and return its word vectors, looked up as glovebox[token].'''
+    tokens = tokenize_text(text)
+
+ wvs = []
+ for token in tokens:
+ wvs.append(glovebox[token])
+
+ return wvs
\ No newline at end of file
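
A short worked example of `normalize_sos` under its defaults, applying the function above to toy data (note that importing this module also pulls in the spaCy-backed `language` module):

```python
from nlpdatahandlers.util.misc import normalize_sos

seqs = [[1, 2, 3], [4, 5, 6, 7, 8]]

# prepend=True (default): pad at the front, trim from the front
normalize_sos(seqs, sz=4)                  # [[0, 1, 2, 3], [5, 6, 7, 8]]

# prepend=False: pad at the back, trim from the back
normalize_sos(seqs, sz=4, prepend=False)   # [[1, 2, 3, 0], [4, 5, 6, 7]]
```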
diff --git a/nlpdatahandlers/util/parallel.py b/nlpdatahandlers/util/parallel.py
new file mode 100644
index 0000000..971720c
--- /dev/null
+++ b/nlpdatahandlers/util/parallel.py
@@ -0,0 +1,18 @@
+'''
+parallel.py -- multiprocessing helpers for data loading
+'''
+
+def parallel_run(f, params):
+ '''
+    Performs a multi-core map of the function `f`
+    over the parameter space spanned by `params`.
+
+ `f` MUST take only one argument.
+ '''
+ from multiprocessing import Pool
+
+ pool = Pool()
+ ret = pool.map(f, params)
+ pool.close()
+ pool.join()
+ return ret
\ No newline at end of file
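
A minimal usage sketch. Because `parallel_run` uses `multiprocessing.Pool`, `f` must be picklable (defined at module top level), and on platforms that spawn rather than fork the call should sit under an `if __name__ == '__main__':` guard:

```python
from nlpdatahandlers.util.parallel import parallel_run

def char_count(text):
    # Top-level function so it can be pickled by multiprocessing
    return len(text)

if __name__ == '__main__':
    counts = parallel_run(char_count, ["first review", "second", "third one"])
    print(counts)   # [12, 6, 9]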
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0fa54b6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup
+from setuptools import find_packages
+
+setup(name='NLPDataHandlers',
+ version='0.0.1',
+ description='Library for loading datasets for deep learning.',
+ author='Luke de Oliveira, Alfredo Lainez',
+ author_email='lukedeo@stanford.edu, alainez@stanford.edu',
+ url='https://github.com/textclf/data-handler',
+ # install_requires=['pandas'],
+ packages=find_packages())
\ No newline at end of file
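
Once the package is installed (for development, an editable install with `pip install -e .` from the repository root is the usual route), the renamed modules resolve as below; this is only a sanity check of the new layout, not part of the diff.

```python
# Quick import sanity check for the new package layout
from nlpdatahandlers.base import BaseDataHandler, DataHandlerException
from nlpdatahandlers.imdb import ImdbDataHandler
from nlpdatahandlers.util.parallel import parallel_run
```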