From c73eec5b2cd4d1dbd82a6e2967fe708b859e279c Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 17 Jun 2016 18:08:35 -0600 Subject: [PATCH 01/37] Created europal.py data itterator. Currently needs europal data unpacked in source directory. --- cortex/datasets/basic/europarl.py | 239 ++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 cortex/datasets/basic/europarl.py diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py new file mode 100644 index 0000000..6e7fd1e --- /dev/null +++ b/cortex/datasets/basic/europarl.py @@ -0,0 +1,239 @@ +""" +Europarl dataset for machine translation. + +Currently only supports fr-en datasets. +""" + +from .. import BasicDataset, make_one_hot +import string +import numpy as np +from collections import defaultdict +from functools import partial +import logging +from guppy import hpy + + +class Europarl(BasicDataset): + """ + Europarl dataset itterator. + """ + def __init__(self, source=None, mode='train', english_to_french=True, + name='europarl', out_path=None, max_words=5000, + max_sentence=30, max_length=7000, **kwargs): + + self.logger = logging.getLogger( + '.'.join([self.__module__, self.__class__.__name__])) + self.logger.info('Loading %s from %s' % (name, source)) + + if source is None: + raise ValueError('No source file provided.') + print 'Loading {name} ({mode}) from {source}'.format( + name=name, mode=mode, source=source) + + self.masken = None + self.maskfr = None + self.masky = None + self.maskx = None + self.max_sentence = max_sentence + self.max_length = max_length + self.max_words = max_words + self.n_observations = 0 + self.english_to_french = english_to_french + X, Y = self.get_data(source) + data = {name: X, 'label': Y} + distributions = {name: 'multinomial', 'label': 'multinomial'} + + super(Europarl, self).__init__(data, distributions=distributions, + name=name, mode=mode, **kwargs) + + self.out_path = out_path + + if self.shuffle: + self.randomize() + + def slice_data(self, idx, data=None): # Function for restricting dataset in instance. + if data is None: data = self.data + for k, v in data.iteritems(): + self.data[k] = v[idx] + self.n_observations = len(idx) + self.X = data[self.name] + if self.labels in data.keys(): + self.Y = data[self.labels] + self.n = self.X.shape[0] + + def get_data(self, source): + fr = open(source + 'europarl-v7.fr-en.fr') #### NOT SURE IF SOURCING IS CORRECT + en = open(source + 'europarl-v7.fr-en.en') + + X = [] + Y = [] + fMax = 0 + eMax = 0 + self.itt_pos1 = 4 + self.itt_pos2 = 4 + self.frStringToToken = defaultdict(partial(self.count1, self.max_words, 3)) + self.enStringToToken = defaultdict(partial(self.count2, self.max_words, 3)) + special_tokens = {'': 0, '': 1, '': 2, '': 3} + + i = 0 + for eSentence, fSentence in zip(en.__iter__(), fr.__iter__()): # Itterate through file lines + if len(self.string_process(eSentence)) <= self.max_sentence\ + and len(self.string_process(fSentence)) <= self.max_sentence: + X.append([self.enStringToToken[eWord] for eWord in self.string_process(eSentence)]) # Convert to numerical + if len(X[-1]) > eMax: # Keep track of largest sentence in language. + eMax = len(X[-1]) + Y.append([self.frStringToToken[fWord] for fWord in self.string_process(fSentence)]) + if len(Y[-1]) > fMax: + fMax = len(Y[-1]) + i += 1 + if i >= self.max_length: + break + + fr.close() + en.close() + del fr + del en + + print 'Data loaded, preprocessing...' + print 'Padding data.' 
+ self.n_observations = len(X) # Update sample size + + X = map(partial(self.pad_array, length=(eMax + 2)), X) + Y = map(partial(self.pad_array, length=(fMax + 2)), Y) + + print 'Creating masks.' + self.masken = map(self.create_mask, X) + self.maskfr = map(self.create_mask, Y) + + self.masken = np.array(self.masken, dtype='float32') + self.maskfr = np.array(self.maskfr, dtype='float32') + + print 'Converting to one-hot.' + # The following couple lines are really slow to run. + X = make_one_hot(np.array(X).reshape((eMax + 2) * self.n_observations))\ + .reshape((self.n_observations, eMax + 2, max(self.enStringToToken.values()) + 1)) # Convert to one hot, (array -> vector -> one-hot -> array) + Y = make_one_hot(np.array(Y).reshape((fMax + 2) * self.n_observations))\ + .reshape((self.n_observations, fMax + 2, max(self.frStringToToken.values()) + 1)) + + print 'Converting to float32.' + # Conversion after one-hot as float32 arrays slow down one-hot conversion. + X = X.astype('float32') + Y = Y.astype('float32') + + self.frStringToToken.update(special_tokens) + self.enStringToToken.update(special_tokens) + + print 'Data prepared.' + if self.english_to_french: + self.maskx = self.masken + self.masky = self.maskfr + return X, Y + else: + self.maskx = self.maskfr + self.masky = self.masken + return Y, X + + @staticmethod + def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): + if C is None: + C = Europarl + europarl = C(batch_size=10, **kwargs) + if hasattr(europarl, 'logger'): + logger = europarl.logger + europarl.logger = None + else: + logger = logging.getLogger('.'.join([europarl.__module__, europarl.__class__.__name__])) + + if idx is None: + logger.info('Splitting dataset into ratios %r' % split) + if round(np.sum(split), 5) != 1. or len(split) != 3: + raise ValueError(split) + + if europarl.balance: + raise NotImplementedError() + else: + split_idx = [] + accum = 0 + for s in split: # Create indicies from percentage values + s_i = int(s * europarl.n_observations + accum) + split_idx.append(s_i) + accum += s_i + idx = range(europarl.n_observations) + + train_idx = idx[:split_idx[0]] + valid_idx = idx[split_idx[0]:split_idx[1]] + test_idx = idx[split_idx[1]:] + idx = [train_idx, valid_idx, test_idx] + else: + logger.info('Splitting dataset into ratios %.2f / %.2f /%.2f ' + 'using given indices' + % tuple(len(idx[i]) / float(europarl.n_observations) + for i in range(3))) + + assert len(batch_sizes) == len(idx) # Shouldn't have different number of batch sizes than datasets + + datasets = [] + modes = ['train', 'valid', 'test'] + data = europarl.data + europarl.data = dict() + for i, bs, mode in zip(idx, batch_sizes, modes): # Create correctly restricted copies of dataset + if bs is None: + dataset = None + else: + dataset = europarl.copy() + dataset.slice_data(i, data=data) + dataset.batch_size = bs + dataset.logger = logger + dataset.mode = mode + datasets.append(dataset) + + return datasets + [idx] + + table = string.maketrans('', '') # Variable for string_process() + + def string_process(self, s): # Helper method for get_data() + return s.lower().translate(self.table, string.punctuation).split() + + @staticmethod + def pad_array(arr, length): # Helper method for get_data() + return [1] + arr + [2] + ([0]*(length - len(arr) - 2)) + + @staticmethod # Helper method for creating mask array from a list. 
+ def create_mask(lst): + return np.array([int(bool(x)) for x in lst]) + + def count1(self, max_words, unknown_val): # Crappy replacement for count as itterators can not be copied. :( + if self.itt_pos1 <= max_words: + self.itt_pos1 += 1 + return self.itt_pos1 - 1 + else: + return unknown_val + + def count2(self, max_words, unknown_val): + if self.itt_pos2 <= max_words: + self.itt_pos2 += 1 + return self.itt_pos2 - 1 + else: + return unknown_val + +def count(start, max_words, unknown_val): # Helper method for defaultdict in get_data() + s = start + while True: + if s <= max_words: + yield s + s += 1 + else: + yield unknown_val + + +def main(): + #data = Europarl(source='/export/mialab/users/jjohnson/data/basic/', batch_size=10) + train, valid, test, idx = Europarl.factory(source='/export/mialab/users/jjohnson/data/basic/', + batch_sizes=[100, 100, 100], split=[0.7, 0.2, 0.1]) + h = hpy() + print h.heap() + print train.data['europarl'].shape + print valid.data['europarl'].shape + print test.data['europarl'].shape + +main() From 4a1d2c2d899a567a319b31c8c701f4d6dca49276 Mon Sep 17 00:00:00 2001 From: rdevon Date: Sat, 18 Jun 2016 13:01:48 -0600 Subject: [PATCH 02/37] updating europarl iterator --- cortex/datasets/basic/europarl.py | 380 +++++++++++++++++++----------- 1 file changed, 238 insertions(+), 142 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 6e7fd1e..7129f57 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -1,23 +1,44 @@ -""" +''' Europarl dataset for machine translation. Currently only supports fr-en datasets. -""" +''' -from .. import BasicDataset, make_one_hot -import string -import numpy as np from collections import defaultdict from functools import partial -import logging from guppy import hpy +import logging +import numpy as np +from os import path +from progressbar import ( + Bar, + Percentage, + ProgressBar, + Timer +) +import string + +from ..datasets import BasicDataset, make_one_hot +from ...utils import floatX, intX +np.set_printoptions(threshold=np.nan) +logger = logging.getLogger(__name__) + class Europarl(BasicDataset): - """ - Europarl dataset itterator. - """ - def __init__(self, source=None, mode='train', english_to_french=True, + '''Europarl dataset itterator. + + Attributes: + max_sentence (int): maximimum sentence length. 
+ + ''' + _PAD = 0 + _BEG = 1 + _END = 2 + _UNK = 3 + table = string.maketrans('', '') + + def __init__(self, source=None, english_to_french=True, name='europarl', out_path=None, max_words=5000, max_sentence=30, max_length=7000, **kwargs): @@ -27,31 +48,34 @@ def __init__(self, source=None, mode='train', english_to_french=True, if source is None: raise ValueError('No source file provided.') - print 'Loading {name} ({mode}) from {source}'.format( - name=name, mode=mode, source=source) - self.masken = None - self.maskfr = None - self.masky = None - self.maskx = None self.max_sentence = max_sentence self.max_length = max_length self.max_words = max_words - self.n_observations = 0 self.english_to_french = english_to_french - X, Y = self.get_data(source) - data = {name: X, 'label': Y} - distributions = {name: 'multinomial', 'label': 'multinomial'} + + X, Y, Mx, My = self.get_data(source) + data = {name: X, + 'label': Y, + 'mask_in': Mx, + 'mask_out': My} + distributions = {name: 'multinomial', + 'label': 'multinomial', + 'mask_in': None, + 'mask_out': None} super(Europarl, self).__init__(data, distributions=distributions, - name=name, mode=mode, **kwargs) + name=name, one_hot=False, **kwargs) self.out_path = out_path if self.shuffle: self.randomize() - def slice_data(self, idx, data=None): # Function for restricting dataset in instance. + def slice_data(self, idx, data=None): + '''Function for restricting dataset in instance. + + ''' if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] @@ -62,76 +86,181 @@ def slice_data(self, idx, data=None): # Function for restricting dataset in ins self.n = self.X.shape[0] def get_data(self, source): - fr = open(source + 'europarl-v7.fr-en.fr') #### NOT SURE IF SOURCING IS CORRECT - en = open(source + 'europarl-v7.fr-en.en') - - X = [] - Y = [] - fMax = 0 - eMax = 0 - self.itt_pos1 = 4 - self.itt_pos2 = 4 - self.frStringToToken = defaultdict(partial(self.count1, self.max_words, 3)) - self.enStringToToken = defaultdict(partial(self.count2, self.max_words, 3)) - special_tokens = {'': 0, '': 1, '': 2, '': 3} - - i = 0 - for eSentence, fSentence in zip(en.__iter__(), fr.__iter__()): # Itterate through file lines - if len(self.string_process(eSentence)) <= self.max_sentence\ - and len(self.string_process(fSentence)) <= self.max_sentence: - X.append([self.enStringToToken[eWord] for eWord in self.string_process(eSentence)]) # Convert to numerical - if len(X[-1]) > eMax: # Keep track of largest sentence in language. - eMax = len(X[-1]) - Y.append([self.frStringToToken[fWord] for fWord in self.string_process(fSentence)]) - if len(Y[-1]) > fMax: - fMax = len(Y[-1]) - i += 1 - if i >= self.max_length: - break - - fr.close() - en.close() - del fr - del en - - print 'Data loaded, preprocessing...' - print 'Padding data.' - self.n_observations = len(X) # Update sample size - - X = map(partial(self.pad_array, length=(eMax + 2)), X) - Y = map(partial(self.pad_array, length=(fMax + 2)), Y) - - print 'Creating masks.' - self.masken = map(self.create_mask, X) - self.maskfr = map(self.create_mask, Y) - - self.masken = np.array(self.masken, dtype='float32') - self.maskfr = np.array(self.maskfr, dtype='float32') - - print 'Converting to one-hot.' - # The following couple lines are really slow to run. 
- X = make_one_hot(np.array(X).reshape((eMax + 2) * self.n_observations))\ - .reshape((self.n_observations, eMax + 2, max(self.enStringToToken.values()) + 1)) # Convert to one hot, (array -> vector -> one-hot -> array) - Y = make_one_hot(np.array(Y).reshape((fMax + 2) * self.n_observations))\ - .reshape((self.n_observations, fMax + 2, max(self.frStringToToken.values()) + 1)) - - print 'Converting to float32.' - # Conversion after one-hot as float32 arrays slow down one-hot conversion. - X = X.astype('float32') - Y = Y.astype('float32') - - self.frStringToToken.update(special_tokens) - self.enStringToToken.update(special_tokens) - - print 'Data prepared.' + special_tokens = { + '': self._PAD, '': self._BEG, + '': self._END, '': self._UNK} + + def preprocess(s): + '''Preprocesses string. + + Args: + s (str): string to be preprocessed. + + Returns: + str: preprocessed string. + + ''' + return s.lower().translate(self.table, string.punctuation).split() + + def make_dictionary(sentences, n_lines, max_words=None): + '''Forms a dictionary from words in sentences. + + If there are more words than max_words, use the top frequent ones. + + Args: + sentences (file Handle) + n_lines (int): number of lines in file. + max_words (Optional[int]): maximum number of words. Default + is self.max_words. + + Returns: + dict: word string to token dictionary. + int: maximum length sentence. + + ''' + self.logger.info('Forming dictionary') + if max_words is None: max_words = self.max_words + + count_dict = defaultdict(int) + + widgets = ['Counting words' , ' (', Timer(), ') [', Percentage(), ']'] + pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() + + max_len = 0 + for i, sentence in enumerate(sentences): + ps = preprocess(sentence) + l = len(ps) + if l <= self.max_sentence: + for word in ps: + count_dict[word] += 1 + max_len = max(l, max_len) + pbar.update(i) + + count_keys_sorted = sorted( + count_dict, key=count_dict.get, reverse=True) + vals_sorted = sorted(count_dict.values(), reverse=True) + keys = count_keys_sorted[:max_words] + omit_freq = sum(vals_sorted[max_words:]) / float(sum(vals_sorted)) + self.logger.info('Setting %d words as with total frequency ' + '%.3g.' + % (max(0, len(count_keys_sorted) - max_words), + omit_freq)) + values = range(4, len(keys) + 4) + + d = dict() + d.update(**special_tokens) + d.update(**dict(zip(keys, values))) + return d, max_len + + def tokenize(sentence, d, pad_length): + '''Tokenize sentence using dictionary. + + Args: + sentence (str): sentence to be tokenized. + d (dict): token dictionary. + pad_length (int): total length up to pad. + + Returns: + list: tokenized sentence as list. + + ''' + ps = preprocess(sentence) + if len(ps) > self.max_sentence: + return [] + s = [self._BEG] + [d.get(w, self._UNK) for w in ps] + [self._END] + s += [self._PAD] * max(0, pad_length + 2 - len(s)) + return s + + def read_and_tokenize(file_path): + '''Read and tokenize a file of sentences. + + Args: + file_path (str): path to file. + + Returns: + list: list of tokenized sentences. + dict: token disctionary. + dict: reverse dictionary. 
+ + ''' + self.logger.info('Reading sentences from %s' % file_path) + with open(file_path) as f: + n_lines = sum(1 for line in f) + f.seek(0) + d, max_len = make_dictionary(f, n_lines) + r_d = dict((v, k) for k, v in d.iteritems()) + tokenized_sentences = [] + + f.seek(0) + self.logger.info('Tokenizing sentences from %s' % file_path) + widgets = ['Tokenizing sentences' , + ' (', Timer(), ') [', Percentage(), ']'] + pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() + for i, sentence in enumerate(f): + ts = tokenize(sentence, d, max_len) + assert len(ts) <= self.max_sentence + 2, (ts, len(ts)) + tokenized_sentences.append(ts) + pbar.update(i) + return tokenized_sentences, d, r_d + + def match_and_trim(sentences_a, sentences_b): + '''Matches 2 lists of sentences and removes incomplete pairs. + + If one of the pairs is `[]`, remove pair. + + Args: + sentences_a (list). + sentences_b (list). + + Returns: + list: new sentences_a + list: new sentences_b + + ''' + self.logger.info('Matching datasets and trimming') + if len(sentences_a) != len(sentences_b): + raise TypeError('Sentence lists are different lengths.') + + sentences_a_tr = [] + sentences_b_tr = [] + widgets = ['Matching sentences' , + ' (', Timer(), ') [', Percentage(), ']'] + trimmed = 0 + pbar = ProgressBar(widgets=widgets, maxval=len(sentences_a)).start() + for i, (s_a, s_b) in enumerate(zip(sentences_a, sentences_b)): + if len(s_a) > 0 and len(s_b) > 0: + sentences_a_tr.append(s_a) + sentences_b_tr.append(s_b) + else: + trimmed += 1 + pbar.update(i) + self.logger.debug('Trimmed %d sentences' % trimmed) + + return sentences_a_tr, sentences_b_tr + + fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.fr'))) + + en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.en'))) + + fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) + if self.english_to_french: - self.maskx = self.masken - self.masky = self.maskfr - return X, Y + X = np.array(en_sentences).astype(intX) + Y = np.array(fr_sentences).astype(intX) else: - self.maskx = self.maskfr - self.masky = self.masken - return Y, X + X = np.array(fr_sentences).astype(intX) + Y = np.array(en_sentences).astype(intX) + + self.nX_tokens = len(np.unique(X).tolist()) + self.nY_tokens = len(np.unique(Y).tolist()) + + self.logger.info('Creating masks') + Mx = (X != 0).astype(intX) + My = (Y != 0).astype(intX) + + return X, Y, Mx, My @staticmethod def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): @@ -141,8 +270,6 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): if hasattr(europarl, 'logger'): logger = europarl.logger europarl.logger = None - else: - logger = logging.getLogger('.'.join([europarl.__module__, europarl.__class__.__name__])) if idx is None: logger.info('Splitting dataset into ratios %r' % split) @@ -155,10 +282,10 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): split_idx = [] accum = 0 for s in split: # Create indicies from percentage values - s_i = int(s * europarl.n_observations + accum) + s_i = int(s * europarl.n + accum) split_idx.append(s_i) accum += s_i - idx = range(europarl.n_observations) + idx = range(europarl.n) train_idx = idx[:split_idx[0]] valid_idx = idx[split_idx[0]:split_idx[1]] @@ -167,16 +294,18 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): else: logger.info('Splitting dataset into ratios %.2f / %.2f /%.2f ' 'using given 
indices' - % tuple(len(idx[i]) / float(europarl.n_observations) + % tuple(len(idx[i]) / float(europarl.n) for i in range(3))) - assert len(batch_sizes) == len(idx) # Shouldn't have different number of batch sizes than datasets + # Shouldn't have different number of batch sizes than datasets + assert len(batch_sizes) == len(idx) datasets = [] modes = ['train', 'valid', 'test'] data = europarl.data europarl.data = dict() - for i, bs, mode in zip(idx, batch_sizes, modes): # Create correctly restricted copies of dataset + # Create correctly restricted copies of dataset + for i, bs, mode in zip(idx, batch_sizes, modes): if bs is None: dataset = None else: @@ -189,51 +318,18 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): return datasets + [idx] - table = string.maketrans('', '') # Variable for string_process() - - def string_process(self, s): # Helper method for get_data() - return s.lower().translate(self.table, string.punctuation).split() - - @staticmethod - def pad_array(arr, length): # Helper method for get_data() - return [1] + arr + [2] + ([0]*(length - len(arr) - 2)) - - @staticmethod # Helper method for creating mask array from a list. - def create_mask(lst): - return np.array([int(bool(x)) for x in lst]) - - def count1(self, max_words, unknown_val): # Crappy replacement for count as itterators can not be copied. :( - if self.itt_pos1 <= max_words: - self.itt_pos1 += 1 - return self.itt_pos1 - 1 - else: - return unknown_val - - def count2(self, max_words, unknown_val): - if self.itt_pos2 <= max_words: - self.itt_pos2 += 1 - return self.itt_pos2 - 1 - else: - return unknown_val - -def count(start, max_words, unknown_val): # Helper method for defaultdict in get_data() - s = start - while True: - if s <= max_words: - yield s - s += 1 - else: - yield unknown_val + def next(self, batch_size=None): + rval = super(Europarl, self).next(batch_size=batch_size) + rval[self.name] = make_one_hot(rval[self.name], + n_classes=self.nX_tokens) + rval['label'] = make_one_hot(rval['label'], + n_classes=self.nY_tokens) + return rval + def save_images(self, out_file=None): + '''Shows tokenized in terms of original words. -def main(): - #data = Europarl(source='/export/mialab/users/jjohnson/data/basic/', batch_size=10) - train, valid, test, idx = Europarl.factory(source='/export/mialab/users/jjohnson/data/basic/', - batch_sizes=[100, 100, 100], split=[0.7, 0.2, 0.1]) - h = hpy() - print h.heap() - print train.data['europarl'].shape - print valid.data['europarl'].shape - print test.data['europarl'].shape + Uses reverse dictionary. -main() + ''' + raise NotImplementedError() From 21542d7f687a8ef8f97ec65faf8557351e463ad3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 12:51:20 -0600 Subject: [PATCH 03/37] Created test for europarl.py. Checks shapes and masks. 
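The mask assertions in this test rest on the convention from europarl.py that the pad token is 0 and masks are built as (tokens != 0). A minimal sketch of that invariant, outside the patch and with made-up token values, just to show what "masks line up" means here:

    import numpy as np

    # hypothetical tokenized batch: <beg> w w <end> <pad> <pad>
    X = np.array([[1, 12, 47, 2, 0, 0],
                  [1,  5,  2, 0, 0, 0]])
    M = (X != 0).astype('int64')  # same construction as get_data()
    # the mask is 1 exactly where the token is not <pad>, which is the
    # element-wise check test_europarl performs on mask_in / mask_out
    assert np.array_equal(M, [[1, 1, 1, 1, 0, 0],
                              [1, 1, 1, 0, 0, 0]])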
--- cortex/datasets/basic/tests/__init__.py | 0 cortex/datasets/basic/tests/test_europarl.py | 24 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 cortex/datasets/basic/tests/__init__.py create mode 100644 cortex/datasets/basic/tests/test_europarl.py diff --git a/cortex/datasets/basic/tests/__init__.py b/cortex/datasets/basic/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py new file mode 100644 index 0000000..df54bba --- /dev/null +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -0,0 +1,24 @@ +""" +Tests europarl.py, try with nosetests test_europarl.py + +Checks that the shapes split properly and that the masks line up. +""" + +from cortex.datasets.basic.europarl import Europarl + + +def test_europarl(split=[0.7, 0.2, 0.1]): + train, valid, test, idx = Europarl.factory(split=split, batch_sizes=[10, 10, 10], + debug=True, source='/export/mialab/users/jjohnson/data/basic') + + for i, dataset in enumerate([train, valid, test]): + + for key in ['europarl', 'mask_in', 'label', 'mask_out']: # Test shapes. + assert dataset.data[key].shape == (int(idx[2][-1] * split[i]) + 1, 32) + + for k in idx[i]: # Test masks. + relative_k = k - idx[i][0] + for j in range(0, len(dataset.data['europarl'][relative_k])): + assert bool(dataset.data['europarl'][relative_k][j]) == bool(dataset.data['mask_in'][relative_k][j]) + for j in range(0, len(dataset.data['label'][relative_k])): + assert bool(dataset.data['label'][relative_k][j]) == bool(dataset.data['mask_out'][relative_k][j]) From 665283ec30e0fb6e23bfb9f31fe7f3d8622e78b4 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 13:01:29 -0600 Subject: [PATCH 04/37] Added support for max_length, added some documentation, removed some unnecessary code in split_data. --- cortex/datasets/basic/europarl.py | 67 +++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 7129f57..7a7e478 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -18,18 +18,23 @@ ) import string -from ..datasets import BasicDataset, make_one_hot +from .. import BasicDataset, make_one_hot from ...utils import floatX, intX np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) + class Europarl(BasicDataset): '''Europarl dataset itterator. Attributes: - max_sentence (int): maximimum sentence length. + max_sentence (int): Maximum sentence length. + max_length (int): Maximum number of sentences. + max_words (int): Maximum size of vocabulary. + english_to_french (bool): If true English is under name key, and French under label key, else reversed. + debug (bool): If true restricts max_length to 1000. ''' _PAD = 0 @@ -40,7 +45,19 @@ class Europarl(BasicDataset): def __init__(self, source=None, english_to_french=True, name='europarl', out_path=None, max_words=5000, - max_sentence=30, max_length=7000, **kwargs): + max_sentence=30, max_length=7000, debug=False, **kwargs): + """ + Args: + source (str): Path to where the europarl data is stored. + english_to_french (bool): True for English input French labels, False for reverse. + name (str): Name of dataset. + out_path (str): Path to save outs. + max_words (int): Maximum vocab size, extra words are marked unknown. + max_sentence (int): Maximum sentence length, longer sentences are ignored. 
+ max_length (int): Maximum number of sentences. + debug (bool): If True restricts max_length to 1000. + **kwargs: + """ self.logger = logging.getLogger( '.'.join([self.__module__, self.__class__.__name__])) @@ -54,6 +71,9 @@ def __init__(self, source=None, english_to_french=True, self.max_words = max_words self.english_to_french = english_to_french + if debug: + self.max_length = 1000 + X, Y, Mx, My = self.get_data(source) data = {name: X, 'label': Y, @@ -75,15 +95,14 @@ def __init__(self, source=None, english_to_french=True, def slice_data(self, idx, data=None): '''Function for restricting dataset in instance. + Args: + idx (list): Indices of data to be kept. + data (dict): Data to be sliced and kept. + ''' if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] - self.n_observations = len(idx) - self.X = data[self.name] - if self.labels in data.keys(): - self.Y = data[self.labels] - self.n = self.X.shape[0] def get_data(self, source): special_tokens = { @@ -123,11 +142,11 @@ def make_dictionary(sentences, n_lines, max_words=None): count_dict = defaultdict(int) - widgets = ['Counting words' , ' (', Timer(), ') [', Percentage(), ']'] + widgets = ['Counting words', ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() max_len = 0 - for i, sentence in enumerate(sentences): + for i, sentence in zip(range(0, n_lines), sentences): ps = preprocess(sentence) l = len(ps) if l <= self.max_sentence: @@ -155,6 +174,8 @@ def make_dictionary(sentences, n_lines, max_words=None): def tokenize(sentence, d, pad_length): '''Tokenize sentence using dictionary. + If sentence is longer than max_sentence, returns []. + Args: sentence (str): sentence to be tokenized. d (dict): token dictionary. @@ -171,11 +192,12 @@ def tokenize(sentence, d, pad_length): s += [self._PAD] * max(0, pad_length + 2 - len(s)) return s - def read_and_tokenize(file_path): + def read_and_tokenize(file_path, max_length): '''Read and tokenize a file of sentences. Args: file_path (str): path to file. + max_length (int): maximum number of lines to read. Returns: list: list of tokenized sentences. 
@@ -185,7 +207,7 @@ def read_and_tokenize(file_path): ''' self.logger.info('Reading sentences from %s' % file_path) with open(file_path) as f: - n_lines = sum(1 for line in f) + n_lines = min(sum(1 for line in f), max_length) f.seek(0) d, max_len = make_dictionary(f, n_lines) r_d = dict((v, k) for k, v in d.iteritems()) @@ -196,7 +218,7 @@ def read_and_tokenize(file_path): widgets = ['Tokenizing sentences' , ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() - for i, sentence in enumerate(f): + for i, sentence in zip(range(0, n_lines), f): ts = tokenize(sentence, d, max_len) assert len(ts) <= self.max_sentence + 2, (ts, len(ts)) tokenized_sentences.append(ts) @@ -223,7 +245,7 @@ def match_and_trim(sentences_a, sentences_b): sentences_a_tr = [] sentences_b_tr = [] - widgets = ['Matching sentences' , + widgets = ['Matching sentences', ' (', Timer(), ') [', Percentage(), ']'] trimmed = 0 pbar = ProgressBar(widgets=widgets, maxval=len(sentences_a)).start() @@ -239,10 +261,10 @@ def match_and_trim(sentences_a, sentences_b): return sentences_a_tr, sentences_b_tr fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.fr'))) + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.en'))) + path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length) fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) @@ -264,6 +286,19 @@ def match_and_trim(sentences_a, sentences_b): @staticmethod def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): + ''' + + Args: + C: Data iterator to use, defaults to Europarl. + split: List of percentage values for train, valid, and test datasets respectively. + idx: List of indices for train, valid and test datasets respectively. + batch_sizes: List of batch sizes for train, valid, and test datasets respectively. + **kwargs: Other arguments to be passed to the data iterator. + + Returns: Train, valid, test,(datasets) indices(list of indices for data of each). + + ''' + if C is None: C = Europarl europarl = C(batch_size=10, **kwargs) From e8cfe60153ed71d8b0755ea5decc1db4a1fd03a0 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 14:24:20 -0600 Subject: [PATCH 05/37] Fixed bug where words that only appeared in sentences that are too long would cause make_one_hot to crash. Added batch size and iteration testing to test_europarl.py. --- cortex/datasets/__init__.py | 4 ++-- cortex/datasets/basic/europarl.py | 10 +++++++--- cortex/datasets/basic/tests/test_europarl.py | 7 +++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index f7efe01..b816a5e 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -70,7 +70,7 @@ def make_one_hot(Y, n_classes=None): class_list = np.unique(Y).tolist() n_classes = len(class_list) else: - class_list = range(n_classes) + class_list = range(0, n_classes) if Y.ndim == 2: reshape = Y.shape @@ -86,7 +86,7 @@ def make_one_hot(Y, n_classes=None): i = class_list.index(Y[idx]) except ValueError: raise ValueError('Class list is missing elements') - O[idx, i] = 1.; + O[idx, i] = 1. 
if reshape is not None: O = O.reshape(reshape + (n_classes,)) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 7a7e478..a94e7ab 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -103,6 +103,7 @@ def slice_data(self, idx, data=None): if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] + self.n = len(idx) def get_data(self, source): special_tokens = { @@ -215,7 +216,7 @@ def read_and_tokenize(file_path, max_length): f.seek(0) self.logger.info('Tokenizing sentences from %s' % file_path) - widgets = ['Tokenizing sentences' , + widgets = ['Tokenizing sentences', ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() for i, sentence in zip(range(0, n_lines), f): @@ -275,8 +276,11 @@ def match_and_trim(sentences_a, sentences_b): X = np.array(fr_sentences).astype(intX) Y = np.array(en_sentences).astype(intX) - self.nX_tokens = len(np.unique(X).tolist()) - self.nY_tokens = len(np.unique(Y).tolist()) + self.nX_tokens = X.max() + 1 + self.nY_tokens = Y.max() + 1 + + #self.nX_tokens = len(np.unique(X).tolist()) + #self.nY_tokens = len(np.unique(Y).tolist()) self.logger.info('Creating masks') Mx = (X != 0).astype(intX) diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py index df54bba..edecfe1 100644 --- a/cortex/datasets/basic/tests/test_europarl.py +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -7,8 +7,8 @@ from cortex.datasets.basic.europarl import Europarl -def test_europarl(split=[0.7, 0.2, 0.1]): - train, valid, test, idx = Europarl.factory(split=split, batch_sizes=[10, 10, 10], +def test_europarl(split=[0.7, 0.2, 0.1], batch_sizes=[10, 10, 10]): + train, valid, test, idx = Europarl.factory(split=split, batch_sizes=batch_sizes, debug=True, source='/export/mialab/users/jjohnson/data/basic') for i, dataset in enumerate([train, valid, test]): @@ -16,6 +16,9 @@ def test_europarl(split=[0.7, 0.2, 0.1]): for key in ['europarl', 'mask_in', 'label', 'mask_out']: # Test shapes. assert dataset.data[key].shape == (int(idx[2][-1] * split[i]) + 1, 32) + for data in dataset: # Test batch sizes. + assert len(data[key]) == batch_sizes[i] + for k in idx[i]: # Test masks. relative_k = k - idx[i][0] for j in range(0, len(dataset.data['europarl'][relative_k])): From fdbb43832736078d1594181458ba3f939339dbd3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 15:33:47 -0600 Subject: [PATCH 06/37] Fixed path sourcing for data loading, and added europarl to supported data. --- cortex/datasets/__init__.py | 4 +++- cortex/datasets/basic/europarl.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index b816a5e..9aa6ea5 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -42,12 +42,14 @@ def resolve(c): from .basic.caltech import CALTECH from .basic.uci import UCI from .basic.cifar import CIFAR + from .basic.europarl import Europarl r_dict = { 'mnist': MNIST, 'cifar': CIFAR, 'caltech': CALTECH, - 'uci': UCI + 'uci': UCI, + 'europarl': Europarl } C = r_dict.get(c, None) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index a94e7ab..c7e5773 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -20,7 +20,7 @@ from .. 
import BasicDataset, make_one_hot from ...utils import floatX, intX - +from ...utils.tools import resolve_path np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) @@ -65,6 +65,7 @@ def __init__(self, source=None, english_to_french=True, if source is None: raise ValueError('No source file provided.') + source = resolve_path(source) self.max_sentence = max_sentence self.max_length = max_length From 7e72540985a90f831225e458ac3f6e7b8a3450f0 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 08:29:27 -0600 Subject: [PATCH 07/37] Added __init__ to demos_basic/tests to make imports for tests work better. --- cortex/demos/demos_basic/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cortex/demos/demos_basic/tests/__init__.py diff --git a/cortex/demos/demos_basic/tests/__init__.py b/cortex/demos/demos_basic/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 5ef70a1385903d1b5950118abc019840ce3bd090 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 08:29:58 -0600 Subject: [PATCH 08/37] Start of an RNN demo. --- cortex/demos/demos_basic/rnn_europarl.py | 93 ++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 cortex/demos/demos_basic/rnn_europarl.py diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py new file mode 100644 index 0000000..e11adc8 --- /dev/null +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -0,0 +1,93 @@ +""" +Demo for next word guessing using an RNN. + +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +from cortex.models.rnn import SimpleRNN +from cortex.datasets import resolve as resolve_dataset + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='sgd', + optimizer_args=dict(), + epochs=100, + valid_key='error', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=100, + valid_batch_size=100, + dataset='europarl', + distribution='multinomial', + source='$data/basic/' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, +) + +simple_rnn_args = dict( + dim_h=100, + input_layer='europarl', + output='label', + dropout=None +) + +extra_arg_keys = ['simple_rnn_args'] + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + simple_rnn_args = module.simple_rnn_args + dim_in = dataset.dims[dataset.name] + dim_out = dataset.dims['label'] + distribution = dataset.distributions[simple_rnn_args['output']] + + model = SimpleRNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, **simple_rnn_args) + + models['rnn'] = model + return models + + +def _cost(module): + models = module.models + + X = module.inputs[module.dataset.name] + used_inputs = [module.dataset.name] + + model = models['rnn'] + outputs = model(X) + + results = OrderedDict() + p = outputs[0]['p'] + base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(axis=0) + print base_cost, '####' + print model.neg_log_prob(X[1:], p[:-1]), '#####' + cost = base_cost + + updates = theano.OrderedUpdates() + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] 
= cost + + return used_inputs, results, updates, constants, outputs[0] From 38dd209cea1a4f12b3eef6f9428cdb6a6d522744 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:53:18 -0600 Subject: [PATCH 09/37] Added field (self.dimall) for holding all dimensions of data. (useful if shuffleing order of indicies) --- cortex/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index 9aa6ea5..acb6749 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -369,6 +369,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, self.balance = balance self.dims = dict() + self.dimsall = dict() if distributions is None: self.distributions = dict() else: @@ -393,6 +394,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, 'number of samples (shape[0]), ' '(%d vs %d)' % (self.n, v.shape[0])) self.dims[k] = v.shape[1] + self.dimsall[k] = v.shape if not k in self.distributions.keys(): self.distributions[k] = 'binomial' From 65b767a5292f8c361f6a58826b9ef5240a805909 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:54:25 -0600 Subject: [PATCH 10/37] Fixed source path. --- cortex/datasets/basic/tests/test_europarl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py index edecfe1..ba13101 100644 --- a/cortex/datasets/basic/tests/test_europarl.py +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -9,7 +9,7 @@ def test_europarl(split=[0.7, 0.2, 0.1], batch_sizes=[10, 10, 10]): train, valid, test, idx = Europarl.factory(split=split, batch_sizes=batch_sizes, - debug=True, source='/export/mialab/users/jjohnson/data/basic') + debug=True, source='/export/mialab/users/jjohnson/data/basic/europarl') for i, dataset in enumerate([train, valid, test]): From c796f5a4745111787fa0f26aff67072d04cbb798 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:56:05 -0600 Subject: [PATCH 11/37] Added functionality for not counting words that appear in corrospondence with long sentences of the other language. 
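In other words, the dictionary is now counted only over sentence pairs that will survive match_and_trim: a word that occurs only alongside an over-long sentence in the other language belongs to a pair that gets dropped anyway, and giving it a token id would leave ids in the vocabulary that never appear in the final arrays (the situation that previously broke make_one_hot). A toy sketch of the idea, with made-up sentences and a made-up limit, not the actual find_long_sentences code:

    from collections import defaultdict

    max_sentence = 3
    en_sents = [['the', 'cat'], ['a', 'very', 'long', 'sentence']]
    fr_sents = [['le', 'chat'], ['une', 'phrase']]

    # a pair is dropped if either side is too long, so words are only
    # counted from pairs that will actually be kept
    keep = [i for i, (e, f) in enumerate(zip(en_sents, fr_sents))
            if len(e) <= max_sentence and len(f) <= max_sentence]
    counts = defaultdict(int)
    for i in keep:
        for w in en_sents[i]:
            counts[w] += 1
    assert 'very' not in counts  # occurs only in the dropped pair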
--- cortex/datasets/basic/europarl.py | 62 ++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index c7e5773..5e66197 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -88,6 +88,9 @@ def __init__(self, source=None, english_to_french=True, super(Europarl, self).__init__(data, distributions=distributions, name=name, one_hot=False, **kwargs) + self.dimsall['europarl'] = self.dimsall['europarl'][0], self.dimsall['europarl'][1], self.nX_tokens + self.dimsall['label'] = self.dimsall['label'][0], self.dimsall['label'][1], self.nY_tokens + self.out_path = out_path if self.shuffle: @@ -123,7 +126,18 @@ def preprocess(s): ''' return s.lower().translate(self.table, string.punctuation).split() - def make_dictionary(sentences, n_lines, max_words=None): + def find_long_sentences(epath, fpath, n_lines): + with open(epath) as e: + with open(fpath) as f: + e.seek(0) + f.seek(0) + too_long_indices = [] + for i, eSentence, fSentence in zip(range(0, n_lines), e, f): + if len(preprocess(eSentence)) > self.max_sentence or len(preprocess(fSentence)) > self.max_sentence: + too_long_indices.append(i) + return too_long_indices + + def make_dictionary(sentences, n_lines, max_words=None, too_long_indices=None): '''Forms a dictionary from words in sentences. If there are more words than max_words, use the top frequent ones. @@ -141,6 +155,7 @@ def make_dictionary(sentences, n_lines, max_words=None): ''' self.logger.info('Forming dictionary') if max_words is None: max_words = self.max_words + if too_long_indices is None: too_long_indices = [] count_dict = defaultdict(int) @@ -151,7 +166,7 @@ def make_dictionary(sentences, n_lines, max_words=None): for i, sentence in zip(range(0, n_lines), sentences): ps = preprocess(sentence) l = len(ps) - if l <= self.max_sentence: + if l <= self.max_sentence and i not in too_long_indices: for word in ps: count_dict[word] += 1 max_len = max(l, max_len) @@ -194,7 +209,7 @@ def tokenize(sentence, d, pad_length): s += [self._PAD] * max(0, pad_length + 2 - len(s)) return s - def read_and_tokenize(file_path, max_length): + def read_and_tokenize(file_path, max_length, too_long_indices=None): '''Read and tokenize a file of sentences. 
Args: @@ -211,7 +226,7 @@ def read_and_tokenize(file_path, max_length): with open(file_path) as f: n_lines = min(sum(1 for line in f), max_length) f.seek(0) - d, max_len = make_dictionary(f, n_lines) + d, max_len = make_dictionary(f, n_lines, too_long_indices=too_long_indices) r_d = dict((v, k) for k, v in d.iteritems()) tokenized_sentences = [] @@ -262,11 +277,14 @@ def match_and_trim(sentences_a, sentences_b): return sentences_a_tr, sentences_b_tr + too_long_indices = find_long_sentences(path.join(path.join(source, 'europarl-v7.fr-en.en')), + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) + fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length, too_long_indices=too_long_indices) en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length) + path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length, too_long_indices=too_long_indices) fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) @@ -277,11 +295,8 @@ def match_and_trim(sentences_a, sentences_b): X = np.array(fr_sentences).astype(intX) Y = np.array(en_sentences).astype(intX) - self.nX_tokens = X.max() + 1 - self.nY_tokens = Y.max() + 1 - - #self.nX_tokens = len(np.unique(X).tolist()) - #self.nY_tokens = len(np.unique(Y).tolist()) + self.nX_tokens = len(np.unique(X).tolist()) + int(3 not in np.unique(X).tolist()) + self.nY_tokens = len(np.unique(Y).tolist()) + int(3 not in np.unique(Y).tolist()) self.logger.info('Creating masks') Mx = (X != 0).astype(intX) @@ -366,10 +381,31 @@ def next(self, batch_size=None): n_classes=self.nY_tokens) return rval - def save_images(self, out_file=None): + def save_images(self, image, english=True, out_path=None): '''Shows tokenized in terms of original words. Uses reverse dictionary. ''' - raise NotImplementedError() + print image.shape + sentences = [] + for sentence in image: + sen = [] + for token in sentence: + + if english: + sen.append(self.en_dict_r[token.argmax()]) + else: + sen.append(self.fr_dict_r[token.argmax]) + sentences.append(sen) + + if out_path is None: + for sentence in sentences: + print ' '.join(sentence) + '.' + + else: + with open(out_path, 'w') as f: + for sentence in sentences: + f.write(' '.join(sentence) + '.') + + return sentences From b84c88f114c5f26f1e59e1e2712c4e78c6358e8b Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:56:45 -0600 Subject: [PATCH 12/37] Added _vis(). Appears to be functional. 
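save_images turns one-hot model output back into words by taking an argmax per step and looking the index up in the reverse dictionary. A minimal sketch of that decode step, with a tiny made-up vocabulary (the placeholder token strings are not the real dictionary entries):

    import numpy as np

    dict_r = {0: '<pad>', 1: '<beg>', 2: '<end>', 4: 'hello'}
    # one sampled sentence: each row is a one-hot / softmax vector over tokens
    sentence = np.array([[0., 1., 0., 0., 0.],
                         [0., 0., 0., 0., 1.],
                         [0., 0., 1., 0., 0.]])
    words = [dict_r[step.argmax()] for step in sentence]
    assert words == ['<beg>', 'hello', '<end>']

Note that the lookup needs the result of argmax(), i.e. the index, not the bound method itself.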
--- cortex/demos/demos_basic/rnn_europarl.py | 59 +++++++++++++++++------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py index e11adc8..cabfa3e 100644 --- a/cortex/demos/demos_basic/rnn_europarl.py +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -6,7 +6,9 @@ from collections import OrderedDict import theano +import numpy as np from cortex.models.rnn import SimpleRNN +from cortex.utils import intX, floatX from cortex.datasets import resolve as resolve_dataset @@ -14,20 +16,21 @@ _learning_args = dict( learning_rate=0.01, learning_rate_scheduler=None, - optimizer='sgd', + optimizer='rmsprop', optimizer_args=dict(), epochs=100, - valid_key='error', + valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] ) _dataset_args = dict( - train_batch_size=100, - valid_batch_size=100, + train_batch_size=10, + valid_batch_size=10, + debug=False, dataset='europarl', distribution='multinomial', - source='$data/basic/' + source='$data/basic/europarl' ) _model_args = dict( @@ -36,7 +39,7 @@ ) simple_rnn_args = dict( - dim_h=100, + dim_h=1000, input_layer='europarl', output='label', dropout=None @@ -44,13 +47,19 @@ extra_arg_keys = ['simple_rnn_args'] +#theano.config.compute_test_value = 'warn' + +#theano.config.exception_verbosity = 'high' + +#theano.config.optimizer = 'None' + def _build(module): models = OrderedDict() dataset = module.dataset simple_rnn_args = module.simple_rnn_args - dim_in = dataset.dims[dataset.name] - dim_out = dataset.dims['label'] + dim_in = dataset.dimsall[dataset.name][2] + dim_out = dataset.dimsall[dataset.name][2] distribution = dataset.distributions[simple_rnn_args['output']] model = SimpleRNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, **simple_rnn_args) @@ -62,20 +71,17 @@ def _build(module): def _cost(module): models = module.models - X = module.inputs[module.dataset.name] + X = module.inputs[module.dataset.name].transpose(1, 0, 2) used_inputs = [module.dataset.name] model = models['rnn'] - outputs = model(X) + outputs, preact, updates = model(X) results = OrderedDict() - p = outputs[0]['p'] - base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(axis=0) - print base_cost, '####' - print model.neg_log_prob(X[1:], p[:-1]), '#####' + p = outputs['p'] + base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(0).mean() cost = base_cost - updates = theano.OrderedUpdates() constants = [] l2_decay = module.l2_decay @@ -90,4 +96,25 @@ def _cost(module): results['-sum log p(x | y)'] = base_cost results['cost'] = cost - return used_inputs, results, updates, constants, outputs[0] + return used_inputs, results, updates, constants, outputs + + +def _viz(module, outputs, results, n_samples=10, n_steps=10): + out_path = module.out_path + out_path = None #### For testing purposes + n_tokens = int(module.dataset.dimsall[module.dataset.name][2]) + + pvals = np.zeros((n_samples, n_tokens)) + 1./float(n_tokens) + x0 = module.models['rnn'].trng.multinomial(pvals=pvals, dtype=floatX) + + outputs, updates = module.models['rnn'].sample(x0=x0, n_steps=n_steps) + + updates = theano.OrderedUpdates(updates) + + f_vis = theano.function([], outputs['x'], updates=updates) + + def f_analysis(): + out = f_vis() + return module.dataset.save_images(out, out_path=out_path) + + return f_analysis From a88a6c2f983420fa2f259ae79faa07ba944144ea Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 23 Jun 2016 18:18:25 -0600 Subject: [PATCH 13/37] Added voc.py the start of a 2D classification 
dataset iterator class. --- cortex/datasets/basic/voc.py | 171 +++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 cortex/datasets/basic/voc.py diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py new file mode 100644 index 0000000..66eab2d --- /dev/null +++ b/cortex/datasets/basic/voc.py @@ -0,0 +1,171 @@ +""" +Data itteratoor for VOC classification data. +""" + +from .. import BasicDataset +from os import path +import logging +from ...utils.tools import resolve_path +from PIL import Image +import random + + +class VOC(BasicDataset): + """Dataset iterator for VOC classification data. (Designed for use with Pyramid RNNs) + + Attributes: + + """ + + def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, + mode='train', source=None, name='voc', **kwargs): + """ + + Args: + images_loaded (int): How many images to load + chunk_size (int): Dimension of chunks to be made. + mode (str): Type of data to load, train, trainval, val. + source (str): Path to directory containing VOCdevkit + name: Name of iterator + **kwargs: + """ + + self.logger = logging.getLogger( + '.'.join([self.__module__, self.__class__.__name__])) + self.logger.info('Loading %s from %s' % (name, source)) + + if source is None: + raise ValueError('No source file provided.') + source = resolve_path(source) + + self.chunks = chunks + self.images_loaded = images_loaded + if chunk_size % 2: + self.chunk_size = chunk_size + else: + self.logger.info('Using %d + 1 to get an odd chunk size.' % chunk_size) + self.chunk_size = chunk_size + 1 + + X, Y = self.get_data(source, mode) + data = {name: X, 'label': Y} + distributions = {name: 'multinomial', 'label': 'multinomial'} + + super(VOC, self).__init__(data, distributions=distributions, + name=name, **kwargs) + + self.out_path = out_path + + if self.shuffle: + self.randomize() + + def get_data(self, source, mode): + """Gets data given source, chunks it, and returns chunks with center labels. + + Args: + source (str): File path to directory containing VOCdevkit. + mode (str): Mode of data, eg. train, trainval, val. + + Returns: + + """ + rand = random.Random() + buff_dist = (self.chunk_size + 1)/2 + + def get_unique(pixels): + """Helper function for get_data, returns the number of unique classifiers in an image. + + Args: + im (list of lists): Pixel classifier values. + + Returns (int): Number of unique classifiers in image. + + """ + unique = [] + for line in pixels: + for i in line: + if i not in unique: + unique.append(i) + return len(unique) + + def image_to_pixels(im): + """ + + Args: + im (Image): Image object form PIL + + Returns (list of lists): Pixels + + """ + pixels = list(im.getdata()) + width, height, = im.size + return [pixels[i * width:(i + 1) * width] for i in xrange(height)] + + + def project_to_binary(pixels): + """Helper function for get_data, returns binary version of input pixels. + + Args: + pixels (list of lists of ints): pixels of an image. + + Returns: Pixels projected to binary. + + """ + retval = [] + for ln in pixels: + retval.append([bool(val) for val in ln]) + return retval + + def get_random_chunk(pixels_data, pixels_label): + """Helper function for get_data, gets random chunk from data, and returns label for center. + + Args: + pixels_data (list of lists): Image pixels of data. + pixels_label (list of lists): Image pixels of label. 
+ + Returns: data_chunk (list of lists subsection of pixels_data), label_val (value of label at center of chunk) + + """ + y = rand.randint(buff_dist, len(pixels_data) - buff_dist) + x = rand.randint(buff_dist, len(pixels_data[0]) - buff_dist) + data_chunk = [] + label_val = pixels_label[y][x] + for i in range(y -buff_dist + 1, y + buff_dist): + data_chunk.append(pixels_data[i][x - buff_dist + 1 : buff_dist - 1]) + assert len(data_chunk) == self.chunk_size and len(data_chunk[0]) == self.chunk_size + return data_chunk, label_val + + names = [] + with open(source + 'VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: + for line in f: + names.append(line[:-1]) + + self.data_images = [] + self.label_images = [] + images_loaded = 0 + for name in names: + if images_loaded < self.images_loaded: + label_im = Image.open(source + '/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') + label_pixels = image_to_pixels(label_im) + if get_unique(label_pixels) == 3: + self.label_images.append(project_to_binary(label_pixels)) + + data_im = Image.open(source + 'VOCdevkit/VOC2010/JPEGImages/' + name + '.jpeg').convert('1') + self.data_images.append(image_to_pixels(data_im)) + + images_loaded += 1 + else: + break + + X = [] + Y = [] + for i in xrange(0, self.chunks): + k = rand.randint(0, images_loaded - 1) + x, y = get_random_chunk(self.data_images[k], self.label_images[k]) + X.append(x) + Y.append(y) + + assert len(X) == self.chunks and len(Y) == self.chunks + + return X, Y + + From 6e11504c9ea470e4ca96bf0a141f05b8925f61f8 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 23 Jun 2016 18:19:29 -0600 Subject: [PATCH 14/37] Added pyramid_rnn model and basic tests for its construction and step function. Still a work in progress. --- cortex/models/pyramid_rnn.py | 75 +++++++++++++++++++++++++ cortex/models/tests/test_pyramid_rnn.py | 39 +++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 cortex/models/pyramid_rnn.py create mode 100644 cortex/models/tests/test_pyramid_rnn.py diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py new file mode 100644 index 0000000..85758e7 --- /dev/null +++ b/cortex/models/pyramid_rnn.py @@ -0,0 +1,75 @@ +""" +Module for 2 dimensional pyramid RNN layers. +""" + +from .rnn import RNN +import collections +import theano.tensor as T +import numpy as np +from ..utils import tools + + +class Pyramid_RNN(RNN): + + def __init__(self, dim_in, dim_hs, dim_out=None, output_net=None, + input_net=None, name='pyramid', **kwargs): + + if dim_out is None: + self.dim_out = 1 + super(Pyramid_RNN, self).__init__(dim_in=dim_in, dim_hs=dim_hs, name=name, + output_net=output_net, input_net=input_net, **kwargs) + + @staticmethod + def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): + '''Factory for creating MLPs for Pyramid_RNN and returning . + + Convenience to quickly create MLPs from dictionaries, linking all + relevant dimensions and distributions. + + Args: + dim_in (int): input dimension. + dim_hs (list): dimensions of recurrent units. + dim_out (Optional[int]): output dimension. If not provided, assumed + to be dim_in. + + Returns: + RNN + + ''' + assert len(dim_hs) > 0 + if dim_out is None: + dim_out = 1 + mlps, kwargs = RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) + kwargs.update(**mlps) + + return Pyramid_RNN(dim_in, dim_hs, dim_out=dim_out, **kwargs) + + def set_params(self): + '''Initialize RNN parameters. 
+ + ''' + self.params = collections.OrderedDict() + for i, dim_h in enumerate(self.dim_hs): + Ur = tools.norm_weight(3 * dim_h, dim_h) + self.params['Ur%d' % i] = Ur + + self.set_net_params() + + def _step(self, m, y, h_, Ur): + '''Step function for RNN call. + + Args: + m (T.tensor): masks. + y (T.tensor): inputs. + h_ (T.tensor): recurrent state. + Ur (theano.shared): recurrent connection. + + Returns: + T.tensor: next recurrent state. + + ''' + H_t = T.concatenate((h_, T.roll(h_, 1, 2), T.roll(h_, -1, 2)), 2) + preact = y + T.dot(H_t, Ur) + h = T.tanh(preact) + h = m * h + (1 - m) * h_ + return h diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py new file mode 100644 index 0000000..2573fa9 --- /dev/null +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -0,0 +1,39 @@ +''' +Module for testing 2D pyramid RNN. +''' + +from cortex.models.pyramid_rnn import Pyramid_RNN +import numpy as np +import theano +import theano.tensor as T + + +def test_build(dim_in=13, dim_h=17): + pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], + dim_out=1) + pyramid.set_tparams() + + return pyramid + + +def test_step(pyramid=None, dim_in=13, dim_h=17): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + + m = theano.tensor.tensor3() + y = theano.tensor.tensor3() + h_ = theano.tensor.tensor3() + Ur = theano.tensor.matrix() + + activation = pyramid._step(m, y, h_, Ur) + f = theano.function([m, y, h_, Ur], activation) + + + t = f(np.ones((10, dim_in, dim_h), dtype='float32'), np.ones((10, dim_in, dim_h), dtype='float32'), + np.ones((10, dim_in, dim_h), dtype='float32'), pyramid.params['Ur0']) + + preact = np.ones((10, dim_in, dim_h), dtype='float32') + \ + np.dot(np.ones((10, dim_in, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) + n = np.tanh(preact) + + np.testing.assert_almost_equal(t, n) From 8688d7512173150838b1beaee209aa10ec1be852 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:12:41 -0600 Subject: [PATCH 15/37] Working on the __call__ function. Not currently functional. --- cortex/models/pyramid_rnn.py | 117 +++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 85758e7..71949da 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -5,8 +5,11 @@ from .rnn import RNN import collections import theano.tensor as T +import theano import numpy as np from ..utils import tools +import collections as coll +from ..utils import floatX class Pyramid_RNN(RNN): @@ -73,3 +76,117 @@ def _step(self, m, y, h_, Ur): h = T.tanh(preact) h = m * h + (1 - m) * h_ return h + + def step_call(self, x, m, h0s, *params): + '''Step version of __call__ for scan + + Args: + x (T.tensor): input. + m (T.tensor): mask. + h0s (list): list of recurrent initial states. Calculated in this function now, ie NOT IMPLEMENTED + *params: list of theano.shared. + + Returns: + OrderedDict: dictionary of results. and now calculated h0s + + ''' + n_steps = (x.shape[0] + 1)/2 + n_samples = x.shape[1] + input = x + + updates = theano.OrderedUpdates() + + h0s = [] + hs = [] + directional_values = [] + for k in range(0, 4): # Iterate through directions. 
+ x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.dim_in + 1)/2].astype('float32') + h0s.append([T.alloc(0., x.shape[1], self.dim_in, dim_h).astype(floatX) for dim_h in self.dim_hs]) + for i, h0 in enumerate(h0s[k]): + seqs = [m[:, :, None]] + self.call_seqs(x, None, i, *params) + outputs_info = [h0] + non_seqs = [self.get_recurrent_args(*params)[i]] + h, updates_ = theano.scan( + self._step, + sequences=seqs, + outputs_info=outputs_info, + non_sequences=non_seqs, + name=self.name + '_recurrent_steps_%d' % i, + n_steps=n_steps) + hs.append(h) + x = h + updates += updates_ + directional_values.append(h[(self.dim_in + 1)/2]) # Remember directional outputs. + + o_params = self.get_output_args(*params) + out_net_out = self.output_net.step_call(sum(directional_values), *o_params) # Sum different directions. + preact = out_net_out['z'] + p = out_net_out['p'] + + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s + + def __call__(self, x, m=None, h0s=None, condition_on=None): + '''Call function. + + For learning RNNs. + + Args: + x (T.tensor): input sequence. window x batch x dim (a x b x a) where a is chunk size, b is batch size. + m (T.tensor): mask. window x batch. For masking in recurrent steps. NOT IMPLEMENTED + h0s (Optional[list]): initial h0s. NOT IMPLEMENTED + condition_on (Optional[T.tensor]): conditional for recurrent step. + + Returns: + OrderedDict: dictionary of results: hiddens, probabilities, and + preacts. + theano.OrderedUpdates. + + ''' + constants = [] + input_rotations = [] + + ''' Calculating h0s in step_call so that rotations of data happen once and don't need to be saved. + if h0s is None and self.init_net is not None: + h0s = self.init_net.initialize(x[0]) + constants += h0s + elif h0s is None: + h0s = [T.alloc(0., x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs] + ''' + + if m is None: + m = T.ones((x.shape[0], x.shape[1])).astype(floatX) + + params = self.get_sample_params() + + results, updates, h0s = self.step_call(x, m, h0s, *params) + results['h0s'] = h0s + return results, updates, constants + + def call_seqs(self, x, condition_on, level, *params): + '''Prepares the input for `__call__`. + + Args: + x (T.tensor): input + condtion_on (T.tensor or None): tensor to condition recurrence on. + level (int): reccurent level. + *params: list of theano.shared. + + Returns: + list: list of scan inputs. + + ''' + print x.shape + if level == 0: + i_params = self.get_input_args(*params) + a = self.input_net.step_preact(x, *i_params) + else: + i_params = self.get_inter_args(level - 1, *params) + a = self.inter_nets[level - 1].step_preact(x, *i_params) + print a.shape + print self.input_net.dim_in + print self.input_net.dim_out + + if condition_on is not None: + a += condition_on + + return [a] From 2cb1caaa08f38833f27a6fafa50f7edb7127d944 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:07 -0600 Subject: [PATCH 16/37] Test for new __call__ function added. 
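The test added below builds the graph produced by __call__ and compiles the predicted
probabilities. As a rough usage sketch (names taken from patches 14-15; this is an
illustration, not part of the diff, and assumes the graph builds cleanly):

    import numpy as np
    import theano
    from cortex.models.pyramid_rnn import Pyramid_RNN

    pyramid = Pyramid_RNN.factory(dim_in=13, dim_hs=[17], dim_out=1)
    pyramid.set_tparams()

    x = np.random.randn(13, 10, 13).astype('float32')   # window x batch x dim
    results, updates, constants = pyramid(x)             # __call__ from patch 15
    f = theano.function([], results['p'], updates=updates)
    f()   # evaluate the probabilities (assuming the graph compiles)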
--- cortex/models/tests/test_pyramid_rnn.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 2573fa9..58ee4a8 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -7,6 +7,7 @@ import theano import theano.tensor as T +theano.config.optimizer = 'None' def test_build(dim_in=13, dim_h=17): pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], @@ -37,3 +38,18 @@ def test_step(pyramid=None, dim_in=13, dim_h=17): n = np.tanh(preact) np.testing.assert_almost_equal(t, n) + + +def test_call(pyramid=None, dim_in=13, dim_h=17): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + + rng = np.random.RandomState() + + x = rng.randn(13, 10, 13) + + y = pyramid(x) + + f = theano.function([], y[0]['p']) + + print f() From ee2236f00ed5ac1a0359e6139cb436b5fddfb170 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:25 -0600 Subject: [PATCH 17/37] Tests constructor for voc data iterator. --- cortex/datasets/basic/tests/test_voc.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 cortex/datasets/basic/tests/test_voc.py diff --git a/cortex/datasets/basic/tests/test_voc.py b/cortex/datasets/basic/tests/test_voc.py new file mode 100644 index 0000000..ba8810f --- /dev/null +++ b/cortex/datasets/basic/tests/test_voc.py @@ -0,0 +1,10 @@ +""" +Tests voc.py. +""" + +from ..voc import VOC + + +def test_voc(): + test = VOC(source='$data', batch_size=10) + From eff32b830807f3cd47b5068ebe9cf936d0f8c533 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:46 -0600 Subject: [PATCH 18/37] Tests and minor bug fixes. Normalized outputs. --- cortex/datasets/basic/voc.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 66eab2d..7e0c1e8 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -7,7 +7,9 @@ import logging from ...utils.tools import resolve_path from PIL import Image +import PIL import random +import numpy as np class VOC(BasicDataset): @@ -100,7 +102,6 @@ def image_to_pixels(im): width, height, = im.size return [pixels[i * width:(i + 1) * width] for i in xrange(height)] - def project_to_binary(pixels): """Helper function for get_data, returns binary version of input pixels. 
@@ -112,7 +113,7 @@ def project_to_binary(pixels): """ retval = [] for ln in pixels: - retval.append([bool(val) for val in ln]) + retval.append([int(bool(val)) for val in ln]) return retval def get_random_chunk(pixels_data, pixels_label): @@ -129,13 +130,13 @@ def get_random_chunk(pixels_data, pixels_label): x = rand.randint(buff_dist, len(pixels_data[0]) - buff_dist) data_chunk = [] label_val = pixels_label[y][x] - for i in range(y -buff_dist + 1, y + buff_dist): - data_chunk.append(pixels_data[i][x - buff_dist + 1 : buff_dist - 1]) + for index in range(y - buff_dist + 1, y + buff_dist): + data_chunk.append(pixels_data[index][x - buff_dist + 1:x + buff_dist]) assert len(data_chunk) == self.chunk_size and len(data_chunk[0]) == self.chunk_size return data_chunk, label_val names = [] - with open(source + 'VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: + with open(source + '/basic/VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: for line in f: names.append(line[:-1]) @@ -144,12 +145,12 @@ def get_random_chunk(pixels_data, pixels_label): images_loaded = 0 for name in names: if images_loaded < self.images_loaded: - label_im = Image.open(source + '/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') + label_im = Image.open(source + '/basic/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') label_pixels = image_to_pixels(label_im) if get_unique(label_pixels) == 3: self.label_images.append(project_to_binary(label_pixels)) - data_im = Image.open(source + 'VOCdevkit/VOC2010/JPEGImages/' + name + '.jpeg').convert('1') + data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('1') self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 @@ -161,11 +162,11 @@ def get_random_chunk(pixels_data, pixels_label): for i in xrange(0, self.chunks): k = rand.randint(0, images_loaded - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) - X.append(x) - Y.append(y) + X.append(np.array(x, dtype='float32')/255) # Normalize + Y.append(np.array(y, dtype='float32')) assert len(X) == self.chunks and len(Y) == self.chunks - return X, Y + return np.array(X), np.array(Y) From af3d9d57f81d6acc6d5412470e4cf5ddaa1a2de9 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:17:52 -0600 Subject: [PATCH 19/37] Trying to get a successful demo running. Has a lot of debugging mess in it. Not working. --- cortex/demos/demos_basic/pyramid_voc.py | 125 ++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 cortex/demos/demos_basic/pyramid_voc.py diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py new file mode 100644 index 0000000..b4d35f4 --- /dev/null +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -0,0 +1,125 @@ +""" +Demo for Pyramid RNN on VOC classification dataset. + +Try with 'cortex-run pyramid_voc.py' +""" + +""" +Demo for next word guessing using an RNN. 
+ +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +import numpy as np +from cortex.models.pyramid_rnn import Pyramid_RNN +from cortex.utils import intX, floatX +from cortex.datasets import resolve as resolve_dataset +import theano.tensor as T + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='rmsprop', + optimizer_args=dict(), + epochs=100, + valid_key='-sum log p(x | y)', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=10, + valid_batch_size=10, + #test_batch_size=10, + debug=False, + dataset='voc', + chunks=1000, + distribution='multinomial', + chunk_size=15, + source='$data' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, + mask_in='mask_in' +) + +pyramid_args = dict( + dim_hs=[17], + input_layer='voc', + output='label', +) + +extra_arg_keys = ['pyramid_args'] + +theano.config.on_unused_input = 'ignore' +theano.config.optimizer = 'None' +#theano.config.exception_verbosity = 'high' +#theano.config.compute_test_value = 'warn' + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + pyramid_args = module.pyramid_args + width = dataset.chunk_size + dim_in = 1 + dim_out = 1 + distribution = dataset.distributions[pyramid_args['output']] + + model = Pyramid_RNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, + width=width, **pyramid_args) + + models['pyramid_rnn'] = model + return models + + +def _cost(module): + models = module.models + + X = module.inputs[module.dataset.name]#.swapaxes(0, 1) + Y = module.inputs['label'] + used_inputs = [module.dataset.name, 'label'] + + model = models['pyramid_rnn'] + main(model) + + outputs, preact, updates = model(X) + + results = OrderedDict() + p = outputs['p'] + base_cost = model.neg_log_prob(Y, p).sum(0).mean() + cost = base_cost + + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] = cost + + return used_inputs, results, updates, constants, outputs + + +def main(model): + x = T.alloc(1, 8, 10, 17) + + params = model.get_sample_params() + + #print params + + a = model.call_seqs(x, None, 0, *params)[0] + + print a.eval().shape, '****************************************' From b8275cbdcd255a6532c7e15ff3e054153fa93584 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:18:53 -0600 Subject: [PATCH 20/37] Added factory, support for train/valid/test modes, and fixed output types. --- cortex/datasets/basic/voc.py | 63 +++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 7e0c1e8..627f4cd 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -20,26 +20,30 @@ class VOC(BasicDataset): """ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, - mode='train', source=None, name='voc', **kwargs): + start_image=0, mode='train', source=None, name='voc', **kwargs): """ Args: images_loaded (int): How many images to load chunk_size (int): Dimension of chunks to be made. - mode (str): Type of data to load, train, trainval, val. 
+ mode (str): Type of data to load, train, valid, test. source (str): Path to directory containing VOCdevkit name: Name of iterator **kwargs: """ + self.mode_resolve = {'train': 'train', 'valid': 'trainval', 'test': 'val'} + self.mode = self.mode_resolve[mode] + self.logger = logging.getLogger( '.'.join([self.__module__, self.__class__.__name__])) - self.logger.info('Loading %s from %s' % (name, source)) + self.logger.info('Loading %s from %s as %s' % (name, source, self.mode)) if source is None: raise ValueError('No source file provided.') source = resolve_path(source) + self.start_image = start_image self.chunks = chunks self.images_loaded = images_loaded if chunk_size % 2: @@ -48,7 +52,7 @@ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, self.logger.info('Using %d + 1 to get an odd chunk size.' % chunk_size) self.chunk_size = chunk_size + 1 - X, Y = self.get_data(source, mode) + X, Y = self.get_data(source, self.mode) data = {name: X, 'label': Y} distributions = {name: 'multinomial', 'label': 'multinomial'} @@ -60,12 +64,42 @@ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, if self.shuffle: self.randomize() + @staticmethod + def factory(split=None, idx=None, batch_sizes=None, **kwargs): + if split is None: + raise NotImplementedError('Idx are not supported for this dataset yet.') + if batch_sizes is None: + raise ValueError('Need batch sizes') + + chunks = kwargs['chunks'] + chunk_ammounts = [] + for val in split: + chunk_ammounts.append(int(chunks * val)) + + train = VOC(images_loaded=10, start_image=0, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[0], mode='train', source=kwargs['source'], + batch_size=batch_sizes[0]) + valid = VOC(images_loaded=5, start_image=10, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[1], mode='valid', source=kwargs['source'], + batch_size=batch_sizes[1]) + test = VOC(images_loaded=5, start_image=15, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[2], mode='test', source=kwargs['source'], + batch_size=batch_sizes[2]) + + accum = 0 + idx = [] + for val in chunk_ammounts: + idx.append(range(accum, accum + val)) + accum += val + + return train, valid, test, idx + def get_data(self, source, mode): """Gets data given source, chunks it, and returns chunks with center labels. Args: source (str): File path to directory containing VOCdevkit. - mode (str): Mode of data, eg. train, trainval, val. + mode (str): Mode of data, eg. train, valid, test. Returns: @@ -119,7 +153,7 @@ def project_to_binary(pixels): def get_random_chunk(pixels_data, pixels_label): """Helper function for get_data, gets random chunk from data, and returns label for center. - Args: + Args:self, VOC pixels_data (list of lists): Image pixels of data. pixels_label (list of lists): Image pixels of label. 
@@ -144,7 +178,7 @@ def get_random_chunk(pixels_data, pixels_label): self.label_images = [] images_loaded = 0 for name in names: - if images_loaded < self.images_loaded: + if images_loaded < (self.images_loaded + self.start_image) and images_loaded >= self.start_image: label_im = Image.open(source + '/basic/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') label_pixels = image_to_pixels(label_im) if get_unique(label_pixels) == 3: @@ -154,13 +188,15 @@ def get_random_chunk(pixels_data, pixels_label): self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 + elif images_loaded < self.start_image: + images_loaded += 1 else: break X = [] Y = [] for i in xrange(0, self.chunks): - k = rand.randint(0, images_loaded - 1) + k = rand.randint(0, len(self.data_images) - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) X.append(np.array(x, dtype='float32')/255) # Normalize Y.append(np.array(y, dtype='float32')) @@ -169,4 +205,15 @@ def get_random_chunk(pixels_data, pixels_label): return np.array(X), np.array(Y) + def next(self): + rval = super(VOC, self).next() + + #data = rval['voc'] + + #rval['voc'] = [] + #for k in range(0, 4): + # rval['voc'].append(np.swapaxes(np.rot90(np.swapaxes(data, 1, 2), k), 1, 2)[0:(self.chunk_size + 1)/2].astype('float32')) + + rval['label'] = np.array([b[1] for b in rval['label']]) + return rval From a6400abe95f51c507038dd21a4639cedb1605087 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:20:35 -0600 Subject: [PATCH 21/37] Attempted to get demo running. Not currently working. Has much excess debug code. --- cortex/models/pyramid_rnn.py | 127 ++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 71949da..c8f4f7f 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -14,16 +14,19 @@ class Pyramid_RNN(RNN): - def __init__(self, dim_in, dim_hs, dim_out=None, output_net=None, - input_net=None, name='pyramid', **kwargs): + def __init__(self, dim_in, dim_hs, width, dim_out=None, + output_net=None, input_net=None, name='pyramid', **kwargs): if dim_out is None: self.dim_out = 1 + + self.width = width + super(Pyramid_RNN, self).__init__(dim_in=dim_in, dim_hs=dim_hs, name=name, output_net=output_net, input_net=input_net, **kwargs) @staticmethod - def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): + def factory(dim_in=None, dim_out=None, dim_hs=None, width=None, **kwargs): '''Factory for creating MLPs for Pyramid_RNN and returning . Convenience to quickly create MLPs from dictionaries, linking all @@ -34,6 +37,7 @@ def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): dim_hs (list): dimensions of recurrent units. dim_out (Optional[int]): output dimension. If not provided, assumed to be dim_in. + width (int): How wide the input block is. 
Returns: RNN @@ -42,10 +46,41 @@ def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): assert len(dim_hs) > 0 if dim_out is None: dim_out = 1 - mlps, kwargs = RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) + mlps, kwargs = Pyramid_RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) kwargs.update(**mlps) - return Pyramid_RNN(dim_in, dim_hs, dim_out=dim_out, **kwargs) + return Pyramid_RNN(dim_in, dim_hs, width, dim_out=dim_out, **kwargs) + + @staticmethod + def mlp_factory(dim_in, dim_out, dim_hs, o_dim_in=None, i_net=None, + o_net=None, c_net=None, data_distribution='binomial', + initialization=None, init_args=None, **kwargs): + '''Factory for creating MLPs for Pyramid RNN. + + Args: + dim_in (int): input dimension. + dim_out (int): output dimension. If not provided, assumed + to be dim_in. + dim_hs (list): dimensions of recurrent units. + o_dim_in (Optional[int]): optional input dimension for output + net. If not provided, then use the last hidden dim. + i_net (dict): input network args. + o_net (dict): output network args. + c_net (dict): conditional network args. + data_distribution (str): distribution of the output. + initialization (str): type of initialization. + init_args (dict): initialization keyword arguments. + **kwargs: extra keyword arguments. + + Returns: + dict: MLPs. + dict: extra keyword arguments. + + ''' + + mlps, kwargs = RNN.mlp_factory(dim_in=dim_in, dim_out=dim_out, dim_hs=dim_hs, **kwargs) + + return mlps, kwargs def set_params(self): '''Initialize RNN parameters. @@ -95,15 +130,17 @@ def step_call(self, x, m, h0s, *params): input = x updates = theano.OrderedUpdates() + print 'width = ', self.width, '**************** ', (self.width + 1)/2, self.dim_hs, '++++++++++++++++++++++++++' h0s = [] hs = [] - directional_values = [] + #directional_values = [] for k in range(0, 4): # Iterate through directions. - x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.dim_in + 1)/2].astype('float32') - h0s.append([T.alloc(0., x.shape[1], self.dim_in, dim_h).astype(floatX) for dim_h in self.dim_hs]) + #x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.width + 1)/2].astype('float32') + x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') + h0s.append([T.alloc(0., (self.width + 1)/2, x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs]) for i, h0 in enumerate(h0s[k]): - seqs = [m[:, :, None]] + self.call_seqs(x, None, i, *params) + seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] non_seqs = [self.get_recurrent_args(*params)[i]] h, updates_ = theano.scan( @@ -116,14 +153,19 @@ def step_call(self, x, m, h0s, *params): hs.append(h) x = h updates += updates_ - directional_values.append(h[(self.dim_in + 1)/2]) # Remember directional outputs. + if k == 0: + output = [h[-1, :, (self.width + 1)/2, :]] + else: + output = output + [h[-1, :, (self.width + 1)/2, :]] + #directional_values += [h[-1, :, (self.width + 1)/2, :]] # Remember directional outputs. + #print T.sum(output) o_params = self.get_output_args(*params) - out_net_out = self.output_net.step_call(sum(directional_values), *o_params) # Sum different directions. + out_net_out = self.output_net.step_call(T.sum(output), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. 
@@ -143,18 +185,9 @@ def __call__(self, x, m=None, h0s=None, condition_on=None): ''' constants = [] - input_rotations = [] - - ''' Calculating h0s in step_call so that rotations of data happen once and don't need to be saved. - if h0s is None and self.init_net is not None: - h0s = self.init_net.initialize(x[0]) - constants += h0s - elif h0s is None: - h0s = [T.alloc(0., x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs] - ''' if m is None: - m = T.ones((x.shape[0], x.shape[1])).astype(floatX) + m = T.ones((x.shape)).astype(floatX) params = self.get_sample_params() @@ -175,18 +208,60 @@ def call_seqs(self, x, condition_on, level, *params): list: list of scan inputs. ''' - print x.shape + """ + #print x.shape + x = x[:, :, :, None]#.swapaxes(0, 3) + #print x.shape if level == 0: i_params = self.get_input_args(*params) a = self.input_net.step_preact(x, *i_params) else: i_params = self.get_inter_args(level - 1, *params) a = self.inter_nets[level - 1].step_preact(x, *i_params) - print a.shape - print self.input_net.dim_in - print self.input_net.dim_out + #print a.shape + #print self.input_net.dim_in + #print self.input_net.dim_out + """ + + params = list(params) + + print params + + W = params.pop(1) + b = params.pop(1) + + print W, type(W) + + print params + + for i, z in enumerate(x): + a = T.concatenate([(T.dot(c, W) + b)[:, None, None] for c in z.swapaxes(0, 1)], 1).swapaxes(0, 1) + if i: + T.concatenate([rval, a], 2) + else: + rval = a + + #a = T.dot(a, T.alloc(0, 2, 2)) if condition_on is not None: a += condition_on return [a] + + def rotate(self, tensor, n_times): + #retval = tensor.copy() + if n_times == 0: + return tensor + + #for i in range(0, self.width): + # retval[:, :, tensor.shape[0] - i] = tensor[i, :, :] + + #retval = np.array([tensor[:, :, self.width - 1 - i] for i in range(0, self.width)]) + #retval = + retval = tensor.swapaxes(0, 2)[::-1] + + + if n_times == 1: + return retval + elif n_times > 1: + return self.rotate(retval, n_times - 1) From 7c468ce9d94e1612cbc8a0c07d478784360ec2a2 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:32:44 -0600 Subject: [PATCH 22/37] Added support for VOC dataset. --- cortex/datasets/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index acb6749..354c36f 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -43,13 +43,15 @@ def resolve(c): from .basic.uci import UCI from .basic.cifar import CIFAR from .basic.europarl import Europarl + from .basic.voc import VOC r_dict = { 'mnist': MNIST, 'cifar': CIFAR, 'caltech': CALTECH, 'uci': UCI, - 'europarl': Europarl + 'europarl': Europarl, + 'voc': VOC } C = r_dict.get(c, None) @@ -178,7 +180,7 @@ def dataset_factory(resolve_dataset, dataset=None, split=[0.7, 0.2, 0.1], valid_batch_size=valid_batch_size, test_batch_size=test_batch_size, **dataset_args) else: - train, valid, test, idx = C.factory( + train, valid, test, idx = C.factory( split=split, idx=idx, batch_sizes=[train_batch_size, valid_batch_size, test_batch_size], **dataset_args) From 578ca1d32a4ca2d4aad17289bb2418ab38e9d978 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:34:24 -0600 Subject: [PATCH 23/37] Bugfixes and cleanup for demo. 
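Part of the cleanup below is the rotate() helper, which turns the input volume a quarter
turn per call via tensor.swapaxes(0, 2)[::-1]. A NumPy sketch of the same operation, under
the assumption that the input is square along its first and third axes (shape
(width, batch, width), as __call__ expects):

    import numpy as np

    def rotate_once(a):
        # same index shuffle as rotate() below, written for NumPy arrays
        return a.swapaxes(0, 2)[::-1]

    x = np.random.RandomState(0).randn(13, 10, 13)

    # one call is a quarter turn in the (first, third)-axis plane
    assert np.allclose(rotate_once(x), np.rot90(x.swapaxes(1, 2)).swapaxes(1, 2))
    # four calls bring the array back to where it started
    assert np.allclose(rotate_once(rotate_once(rotate_once(rotate_once(x)))), x)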
--- cortex/models/pyramid_rnn.py | 53 ++++-------------------------------- 1 file changed, 6 insertions(+), 47 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index c8f4f7f..6280810 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -130,15 +130,12 @@ def step_call(self, x, m, h0s, *params): input = x updates = theano.OrderedUpdates() - print 'width = ', self.width, '**************** ', (self.width + 1)/2, self.dim_hs, '++++++++++++++++++++++++++' h0s = [] hs = [] - #directional_values = [] for k in range(0, 4): # Iterate through directions. - #x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.width + 1)/2].astype('float32') x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') - h0s.append([T.alloc(0., (self.width + 1)/2, x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs]) + h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) for i, h0 in enumerate(h0s[k]): seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] @@ -157,15 +154,13 @@ def step_call(self, x, m, h0s, *params): output = [h[-1, :, (self.width + 1)/2, :]] else: output = output + [h[-1, :, (self.width + 1)/2, :]] - #directional_values += [h[-1, :, (self.width + 1)/2, :]] # Remember directional outputs. - #print T.sum(output) o_params = self.get_output_args(*params) - out_net_out = self.output_net.step_call(T.sum(output), *o_params) # Sum different directions. + out_net_out = self.output_net.step_call(T.sum(output, 0), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] + return coll.OrderedDict(hs=hs, p=p[:, 0], z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. @@ -187,7 +182,7 @@ def __call__(self, x, m=None, h0s=None, condition_on=None): constants = [] if m is None: - m = T.ones((x.shape)).astype(floatX) + m = T.ones(x.shape).astype(floatX) params = self.get_sample_params() @@ -208,59 +203,23 @@ def call_seqs(self, x, condition_on, level, *params): list: list of scan inputs. 
''' - """ - #print x.shape - x = x[:, :, :, None]#.swapaxes(0, 3) - #print x.shape + + x = x[:, :, :, None] if level == 0: i_params = self.get_input_args(*params) a = self.input_net.step_preact(x, *i_params) else: i_params = self.get_inter_args(level - 1, *params) a = self.inter_nets[level - 1].step_preact(x, *i_params) - #print a.shape - #print self.input_net.dim_in - #print self.input_net.dim_out - """ - - params = list(params) - - print params - - W = params.pop(1) - b = params.pop(1) - - print W, type(W) - - print params - - for i, z in enumerate(x): - a = T.concatenate([(T.dot(c, W) + b)[:, None, None] for c in z.swapaxes(0, 1)], 1).swapaxes(0, 1) - if i: - T.concatenate([rval, a], 2) - else: - rval = a - - #a = T.dot(a, T.alloc(0, 2, 2)) - - if condition_on is not None: - a += condition_on return [a] def rotate(self, tensor, n_times): - #retval = tensor.copy() if n_times == 0: return tensor - #for i in range(0, self.width): - # retval[:, :, tensor.shape[0] - i] = tensor[i, :, :] - - #retval = np.array([tensor[:, :, self.width - 1 - i] for i in range(0, self.width)]) - #retval = retval = tensor.swapaxes(0, 2)[::-1] - if n_times == 1: return retval elif n_times > 1: From 90e37a5a1ff3f5f7c6fd25de0cd89b89302383d2 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:35:06 -0600 Subject: [PATCH 24/37] Bugfix, demo runs now but doesnt seem to train particularly well. --- cortex/demos/demos_basic/pyramid_voc.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index b4d35f4..df8ece9 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -14,7 +14,7 @@ import theano import numpy as np from cortex.models.pyramid_rnn import Pyramid_RNN -from cortex.utils import intX, floatX +from cortex.utils import intX, floatX, logger from cortex.datasets import resolve as resolve_dataset import theano.tensor as T @@ -34,12 +34,12 @@ _dataset_args = dict( train_batch_size=10, valid_batch_size=10, - #test_batch_size=10, + # test_batch_size=10, debug=False, dataset='voc', chunks=1000, distribution='multinomial', - chunk_size=15, + chunk_size=7, source='$data' ) @@ -50,7 +50,7 @@ ) pyramid_args = dict( - dim_hs=[17], + dim_hs=[25], input_layer='voc', output='label', ) @@ -82,18 +82,17 @@ def _build(module): def _cost(module): models = module.models - X = module.inputs[module.dataset.name]#.swapaxes(0, 1) + X = module.inputs[module.dataset.name].swapaxes(0, 1) Y = module.inputs['label'] used_inputs = [module.dataset.name, 'label'] model = models['pyramid_rnn'] - main(model) outputs, preact, updates = model(X) results = OrderedDict() p = outputs['p'] - base_cost = model.neg_log_prob(Y, p).sum(0).mean() + base_cost = model.neg_log_prob(Y, p).mean() cost = base_cost constants = [] @@ -112,14 +111,3 @@ def _cost(module): return used_inputs, results, updates, constants, outputs - -def main(model): - x = T.alloc(1, 8, 10, 17) - - params = model.get_sample_params() - - #print params - - a = model.call_seqs(x, None, 0, *params)[0] - - print a.eval().shape, '****************************************' From 09f7eb8642945ce331020cdd959cc3301e8e5863 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:36:22 -0600 Subject: [PATCH 25/37] Added support for masked input. 
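The mask passed through here reaches the recurrent step, where the update is
h = m * h + (1 - m) * h_, so timesteps whose mask is zero simply carry the previous
hidden state forward. A small NumPy illustration (not part of the diff):

    import numpy as np

    h_prev = np.array([[0.5, -0.2]], dtype='float32')   # state from the previous step
    h_cand = np.array([[0.9,  0.1]], dtype='float32')   # candidate state for this step
    m = np.array([[0.]], dtype='float32')                # 0 -> this timestep is padding

    h = m * h_cand + (1. - m) * h_prev
    assert np.allclose(h, h_prev)   # padded step leaves the state untouched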
--- cortex/demos/demos_basic/rnn_europarl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py index cabfa3e..d2f764a 100644 --- a/cortex/demos/demos_basic/rnn_europarl.py +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -36,6 +36,7 @@ _model_args = dict( dim_h=None, l2_decay=None, + mask_in='mask_in' ) simple_rnn_args = dict( @@ -71,11 +72,12 @@ def _build(module): def _cost(module): models = module.models + mask_in = module.inputs['mask_in'].transpose(1, 0) X = module.inputs[module.dataset.name].transpose(1, 0, 2) - used_inputs = [module.dataset.name] + used_inputs = [module.dataset.name, 'mask_in'] model = models['rnn'] - outputs, preact, updates = model(X) + outputs, preact, updates = model(X, m=mask_in) results = OrderedDict() p = outputs['p'] From 2de7e897e852c8f3cc804bca9b0317bb1d67e44a Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:38:32 -0600 Subject: [PATCH 26/37] Fixed up __call__ test which should work now. --- cortex/models/tests/test_pyramid_rnn.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 58ee4a8..16e8146 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -9,17 +9,18 @@ theano.config.optimizer = 'None' -def test_build(dim_in=13, dim_h=17): + +def test_build(dim_in=1, dim_h=17, width=13): pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], - dim_out=1) + width=width, dim_out=1) pyramid.set_tparams() return pyramid -def test_step(pyramid=None, dim_in=13, dim_h=17): +def test_step(pyramid=None, dim_in=1, dim_h=17, width=13): if pyramid is None: - pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) m = theano.tensor.tensor3() y = theano.tensor.tensor3() @@ -29,20 +30,22 @@ def test_step(pyramid=None, dim_in=13, dim_h=17): activation = pyramid._step(m, y, h_, Ur) f = theano.function([m, y, h_, Ur], activation) + t = f(np.ones((10, width, dim_h), dtype='float32'), np.ones((10, width, dim_h), dtype='float32'), + np.ones((10, width, dim_h), dtype='float32'), pyramid.params['Ur0']) - t = f(np.ones((10, dim_in, dim_h), dtype='float32'), np.ones((10, dim_in, dim_h), dtype='float32'), - np.ones((10, dim_in, dim_h), dtype='float32'), pyramid.params['Ur0']) + preact = np.ones((10, width, dim_h), dtype='float32') + \ + np.dot(np.ones((10, width, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) - preact = np.ones((10, dim_in, dim_h), dtype='float32') + \ - np.dot(np.ones((10, dim_in, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) n = np.tanh(preact) + print n.shape + np.testing.assert_almost_equal(t, n) -def test_call(pyramid=None, dim_in=13, dim_h=17): +def test_call(pyramid=None, dim_in=1, dim_h=17, width=13): if pyramid is None: - pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) rng = np.random.RandomState() @@ -51,5 +54,3 @@ def test_call(pyramid=None, dim_in=13, dim_h=17): y = pyramid(x) f = theano.function([], y[0]['p']) - - print f() From 4ec1f81dd9dee8cfadd5d23f4147f34336d6371f Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:39:12 -0600 Subject: [PATCH 27/37] Added chunk_size specifier to arguments. Trivial change. 
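For reference, the chunk_size now passed by the test is tied to the geometry in
get_random_chunk: a chunk spans 2 * buff_dist - 1 rows and the assert there requires this
to equal chunk_size, so an odd chunk size gives the labelled pixel a well-defined centre.
A quick check of that arithmetic (assuming buff_dist = (chunk_size + 1) // 2, which is
what the assert implies; illustration only):

    chunk_size = 15
    buff_dist = (chunk_size + 1) // 2              # 8
    offsets = range(-buff_dist + 1, buff_dist)     # row offsets around the centre pixel
    assert len(list(offsets)) == chunk_size        # 15 rows, centred on offset 0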
--- cortex/datasets/basic/tests/test_voc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cortex/datasets/basic/tests/test_voc.py b/cortex/datasets/basic/tests/test_voc.py index ba8810f..356b1a1 100644 --- a/cortex/datasets/basic/tests/test_voc.py +++ b/cortex/datasets/basic/tests/test_voc.py @@ -6,5 +6,4 @@ def test_voc(): - test = VOC(source='$data', batch_size=10) - + test = VOC(source='$data', batch_size=10, chunk_size=15) From 638fccc4ee311412d0171ea2e7534e2ff7b4c044 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:46:45 -0600 Subject: [PATCH 28/37] These changes make no sense but without either or both of them the pyramid demo will not run. --- cortex/utils/trainer.py | 19 ++++++++++++++++++- cortex/utils/training.py | 15 ++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cortex/utils/trainer.py b/cortex/utils/trainer.py index 6f2ba19..ab5de97 100644 --- a/cortex/utils/trainer.py +++ b/cortex/utils/trainer.py @@ -29,6 +29,7 @@ logger = logging.getLogger('cortex') + def setup(module): '''Sets up module. @@ -37,6 +38,7 @@ def setup(module): print_section('Running setup') module.setup() + def set_data(module): '''Sets the datasets. @@ -61,6 +63,7 @@ def set_data(module): module.valid_dataset = datasets['valid'] module.test_dataset = datasets['test'] + def make_inputs(module): '''Forms the inputs from the dataset @@ -95,6 +98,7 @@ def make_inputs(module): dataset.reset() module.inputs = inps + def build(module, model_to_load=None): '''Forms the models. @@ -109,6 +113,7 @@ def build(module, model_to_load=None): module.models = models return set_tparams(module) + def set_tparams(module): tparams = OrderedDict() for k, v in module.models.iteritems(): @@ -116,6 +121,7 @@ def set_tparams(module): module.tparams = tparams return tparams + def set_cost(module): '''Sets costs. @@ -135,6 +141,7 @@ def set_cost(module): module.inputs = inputs return results, updates, constants, outputs + def set_test_function(module, results, outputs): '''Sets the test function of a module. @@ -142,9 +149,10 @@ def set_test_function(module, results, outputs): if hasattr(module, 'test'): f_test = module.test(results, outputs) else: - f_test = theano.function(module.inputs.values(), results) + f_test = theano.function(module.inputs.values()[::-1], results) return f_test + def set_out_function(module, results, outputs): '''Sets function for outputs. ''' @@ -158,6 +166,7 @@ def set_out_function(module, results, outputs): f_outs = theano.function(module.inputs.values(), outs) return f_outs + def set_save_function(module, tparams): '''Sets the save function of a module. @@ -174,6 +183,7 @@ def save(outfile): f_save = save return f_save + def set_viz_function(module, results, outputs): '''Sets the visualization function of a module. @@ -188,12 +198,14 @@ def set_viz_function(module, results, outputs): else: return None + def set_eval_functions(module, **kwargs): if hasattr(module, 'eval'): return module.eval(**kwargs) else: return OrderedDict() + def check(module): '''Runs checks. @@ -202,6 +214,7 @@ def check(module): logger.info('Checking experiment.') module.check() + def finish(module): '''Extra finishing-up. 
@@ -210,6 +223,7 @@ def finish(module): logger.info('Finishing up setup') module.finish() + def train(module, cost, tparams, updates, constants, f_test=None, f_save=None, f_viz=None, f_outs=None, test_every=10, show_every=10, monitor_gradients=False): @@ -289,6 +303,7 @@ def show(self): for k in self.eval_keys: self.__dict__[k]() + class ModuleContainer(object): __required_methods = ['_build', '_cost'] __optional_methods = ['_setup', '_data', '_test', '_save', '_viz', '_check', @@ -411,6 +426,7 @@ def add_component(component): if len(component_list) > 0: flatten_component_layers(component_list, model_dict) + def load_module(model_file, strict=True): '''Loads pretrained model. @@ -503,6 +519,7 @@ def load_module(model_file, strict=True): set_tparams(module) return module + def main(args=None): if args is None: args = sys.argv[1:] diff --git a/cortex/utils/training.py b/cortex/utils/training.py index 4796806..6f5bfb1 100644 --- a/cortex/utils/training.py +++ b/cortex/utils/training.py @@ -12,7 +12,7 @@ from collections import OrderedDict from glob import glob import logging -if not 'matplotlib' in sys.modules: +if 'matplotlib' not in sys.modules: import matplotlib matplotlib.use('Agg') import numpy as np @@ -47,6 +47,7 @@ np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) + def make_argument_parser(): '''Generic experiment parser. @@ -69,6 +70,7 @@ def make_argument_parser(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_trainer(): '''Generic experiment parser for a trainer. @@ -93,6 +95,7 @@ def make_argument_parser_trainer(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_test(): '''Generic experiment parser for testing. @@ -112,6 +115,7 @@ def make_argument_parser_test(): help='Verbosity of the logging. (0, 1, 2)') return parser + def set_experiment(args): '''Generic experiment setup method. @@ -186,6 +190,7 @@ def set_experiment(args): exp_dict['model_to_load'] = model_to_load return exp_dict + def reload_model(args): '''Reloads a model from argparse args. @@ -237,6 +242,7 @@ def reload_model(args): exp_dict.update(**args) return exp_dict + def set_model(create_model, model_to_load, unpack, **kwargs): '''Convenience method for creating new or loading old model. @@ -267,6 +273,7 @@ def create_model(): models = create_model() return models + def set_tparams(model_dict): '''Generic tparams setter. @@ -279,6 +286,7 @@ def set_tparams(model_dict): tparams.update(**model.set_tparams()) return tparams + def set_params(tparams, updates, excludes=[]): '''Sets params, removing updates from tparams. @@ -307,6 +315,7 @@ def set_params(tparams, updates, excludes=[]): return tparams, all_params + def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, optimizer='sgd', optimizer_args=None, **learning_args): @@ -343,6 +352,7 @@ def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, return f_grad_shared, f_grad_updates, learning_args + def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): '''Tests the model using a data iterator. @@ -406,6 +416,7 @@ def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): return results + def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, valid_sign=None, bestfile=None, **kwargs): '''Generic validation method. 
@@ -444,6 +455,7 @@ def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, return best_valid, best_epoch + def main_loop(train, valid, f_grad_shared, f_grad_updates, f_test, f_test_keys=None, @@ -583,6 +595,7 @@ def main_loop(train, valid, if e > epochs: break + inps.reverse() # THIS IS A SPOT FIX FOR 'pyramid_voc.py' DEMO! rval = f_grad_shared(*inps) if check_bad_nums(rval): From 5c8b4633464339c351f3bd45dc26910852452070 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:47:38 -0600 Subject: [PATCH 29/37] Fixed an issue with training data being binary and a docstring typo. --- cortex/datasets/basic/voc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 627f4cd..bce23f2 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -111,7 +111,7 @@ def get_unique(pixels): """Helper function for get_data, returns the number of unique classifiers in an image. Args: - im (list of lists): Pixel classifier values. + pixels (list of lists): Pixel classifier values. Returns (int): Number of unique classifiers in image. @@ -184,7 +184,7 @@ def get_random_chunk(pixels_data, pixels_label): if get_unique(label_pixels) == 3: self.label_images.append(project_to_binary(label_pixels)) - data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('1') + data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('L') self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 @@ -198,7 +198,7 @@ def get_random_chunk(pixels_data, pixels_label): for i in xrange(0, self.chunks): k = rand.randint(0, len(self.data_images) - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) - X.append(np.array(x, dtype='float32')/255) # Normalize + X.append(np.array(x, dtype='float32')/255.0) # Normalize Y.append(np.array(y, dtype='float32')) assert len(X) == self.chunks and len(Y) == self.chunks @@ -208,12 +208,6 @@ def get_random_chunk(pixels_data, pixels_label): def next(self): rval = super(VOC, self).next() - #data = rval['voc'] - - #rval['voc'] = [] - #for k in range(0, 4): - # rval['voc'].append(np.swapaxes(np.rot90(np.swapaxes(data, 1, 2), k), 1, 2)[0:(self.chunk_size + 1)/2].astype('float32')) - rval['label'] = np.array([b[1] for b in rval['label']]) return rval From 608eccfc280d4f9794fa65b8a2fa8b9dcee674bb Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 17:08:11 -0600 Subject: [PATCH 30/37] Removed unnecessary print statement. --- cortex/models/tests/test_pyramid_rnn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 16e8146..f5e322a 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -38,8 +38,6 @@ def test_step(pyramid=None, dim_in=1, dim_h=17, width=13): n = np.tanh(preact) - print n.shape - np.testing.assert_almost_equal(t, n) From 78c527415bbeef3b9bea5ae25a9c2cf4ae87c284 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:54:34 -0600 Subject: [PATCH 31/37] Removed reversal of input order. 
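Context for removing the spot fix: theano.function takes its arguments positionally, so
the values fed in by the training loop have to arrive in the same order as
module.inputs.values() had when the function was compiled; reversing inps papered over an
ordering mismatch rather than fixing it. A minimal illustration of the positional
behaviour (not part of the diff):

    import numpy as np
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.vector('y')
    f = theano.function([x, y], x.sum() + y.sum())

    xv = np.ones((2, 3), dtype=theano.config.floatX)
    yv = np.ones((4,), dtype=theano.config.floatX)
    f(xv, yv)      # fine
    # f(yv, xv)    # fails: each argument has the wrong number of dimensions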
--- cortex/utils/training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cortex/utils/training.py b/cortex/utils/training.py index 6f5bfb1..1f279fc 100644 --- a/cortex/utils/training.py +++ b/cortex/utils/training.py @@ -595,7 +595,6 @@ def main_loop(train, valid, if e > epochs: break - inps.reverse() # THIS IS A SPOT FIX FOR 'pyramid_voc.py' DEMO! rval = f_grad_shared(*inps) if check_bad_nums(rval): From 4f83fe19e2387ecf2fe8d7561638cf04d3d06fb3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:54:50 -0600 Subject: [PATCH 32/37] Removed reversal of input order. --- cortex/utils/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/utils/trainer.py b/cortex/utils/trainer.py index ab5de97..c9f759c 100644 --- a/cortex/utils/trainer.py +++ b/cortex/utils/trainer.py @@ -149,7 +149,7 @@ def set_test_function(module, results, outputs): if hasattr(module, 'test'): f_test = module.test(results, outputs) else: - f_test = theano.function(module.inputs.values()[::-1], results) + f_test = theano.function(module.inputs.values(), results) # Not sure why this change was made, but it seems necessary. return f_test From bf899380a2cbfc8defa880943bb1feb61e507e1a Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:55:24 -0600 Subject: [PATCH 33/37] Fixed some name shadowing through scopes. --- cortex/datasets/basic/voc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index bce23f2..0851eac 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -117,10 +117,10 @@ def get_unique(pixels): """ unique = [] - for line in pixels: - for i in line: - if i not in unique: - unique.append(i) + for l in pixels: + for j in l: + if j not in unique: + unique.append(j) return len(unique) def image_to_pixels(im): From 77ebe3a324aac1d80d4305d9fc073148e93ee641 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:55:59 -0600 Subject: [PATCH 34/37] Added proper test for __call__ function that compares it to numpy calculated values. --- cortex/models/tests/test_pyramid_rnn.py | 28 ++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index f5e322a..0bcc21b 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -51,4 +51,30 @@ def test_call(pyramid=None, dim_in=1, dim_h=17, width=13): y = pyramid(x) - f = theano.function([], y[0]['p']) + f = theano.function([], y[0]['z']) + + # Now calculate what f should be using Numpy. + + outs =[] + for k in range(0, 4): # Iterate through directions + x = np.rot90(x.swapaxes(1, 2)).swapaxes(1, 2) # Rotate input + + dir_input = x[:(width + 1)/2, :, :, None] + dir_input = pyramid.input_net.params['b0'] + np.dot(dir_input, pyramid.input_net.params['W0']) + + h = np.zeros((x.shape[1], width, dim_h)) + Ur = pyramid.params['Ur0'] + + for layer in dir_input: # Iterate through height of pyramid + h_t = np.concatenate((h, np.roll(h, 1, 2), np.roll(h, -1, 2)), 2) + preact = layer + np.dot(h_t, Ur) + h = np.tanh(preact) + + outs.append(h[:, (width + 1)/2, :]) # Remember output for direction + + output = pyramid.output_net.params['b0'] + np.dot(sum(outs), pyramid.output_net.params['W0']) # Sum over direction\ + # and apply output network. + + # Test for equality. + + np.testing.assert_almost_equal(output, f()) # Check if they match. 
From e40bdd33b0081e7dc225bb9e808acbea9deb22e5 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:56:20 -0600 Subject: [PATCH 35/37] Cleaned up __call__ a little bit. --- cortex/models/pyramid_rnn.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 6280810..a8cfe33 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -133,9 +133,10 @@ def step_call(self, x, m, h0s, *params): h0s = [] hs = [] + output = [] for k in range(0, 4): # Iterate through directions. - x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') - h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) + x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') # Rotates input + h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) # Make h0 for i, h0 in enumerate(h0s[k]): seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] @@ -150,17 +151,14 @@ def step_call(self, x, m, h0s, *params): hs.append(h) x = h updates += updates_ - if k == 0: - output = [h[-1, :, (self.width + 1)/2, :]] - else: - output = output + [h[-1, :, (self.width + 1)/2, :]] + output.append(h[-1, :, (self.width + 1)/2, :]) # Collect directional outputs. o_params = self.get_output_args(*params) out_net_out = self.output_net.step_call(T.sum(output, 0), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p[:, 0], z=preact), updates, h0s[0] + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. @@ -215,6 +213,15 @@ def call_seqs(self, x, condition_on, level, *params): return [a] def rotate(self, tensor, n_times): + """ + + Args: + tensor (theano tensor): Tensor to rotate along first and third axes. + n_times (int): Number of times to rotate. + + Returns: Tensor rotated n times about its first and third axes. + + """ if n_times == 0: return tensor From fe4e639ffff22e345207cfaa8336ad2f445d84f6 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:58:33 -0600 Subject: [PATCH 36/37] Made cost function work with adjusted pyramid_rnn code and removed the no optimizer flag. (The warn on excess input is still necessary though, otherwise f_out from trainer.py throws error upon creation.) 
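The cost below is base_cost = model.neg_log_prob(Y, p[:, 0]).mean(). The exact form
depends on the configured distribution, but with the binary centre-pixel labels used here
the natural reading is a Bernoulli cross-entropy. A NumPy sketch of that reading (an
assumption for illustration, not the library's implementation):

    import numpy as np

    def neg_log_prob(y, p, eps=1e-7):
        # Bernoulli cross-entropy, clipped for numerical safety
        p = np.clip(p, eps, 1. - eps)
        return -(y * np.log(p) + (1. - y) * np.log(1. - p))

    y = np.array([1., 0., 1.], dtype='float32')     # centre-pixel labels for 3 chunks
    p = np.array([0.9, 0.2, 0.6], dtype='float32')  # stand-in for p[:, 0]
    base_cost = neg_log_prob(y, p).mean()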
--- cortex/demos/demos_basic/pyramid_voc.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index df8ece9..fcaba9d 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -25,7 +25,7 @@ learning_rate_scheduler=None, optimizer='rmsprop', optimizer_args=dict(), - epochs=100, + epochs=150, valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] @@ -39,7 +39,7 @@ dataset='voc', chunks=1000, distribution='multinomial', - chunk_size=7, + chunk_size=25, source='$data' ) @@ -58,9 +58,9 @@ extra_arg_keys = ['pyramid_args'] theano.config.on_unused_input = 'ignore' -theano.config.optimizer = 'None' -#theano.config.exception_verbosity = 'high' -#theano.config.compute_test_value = 'warn' +# theano.config.optimizer = 'None' +# theano.config.exception_verbosity = 'high' +# theano.config.compute_test_value = 'warn' def _build(module): @@ -84,7 +84,7 @@ def _cost(module): X = module.inputs[module.dataset.name].swapaxes(0, 1) Y = module.inputs['label'] - used_inputs = [module.dataset.name, 'label'] + used_inputs = ['label', module.dataset.name] model = models['pyramid_rnn'] @@ -92,7 +92,7 @@ def _cost(module): results = OrderedDict() p = outputs['p'] - base_cost = model.neg_log_prob(Y, p).mean() + base_cost = model.neg_log_prob(Y, p[:, 0]).mean() cost = base_cost constants = [] @@ -110,4 +110,3 @@ def _cost(module): results['cost'] = cost return used_inputs, results, updates, constants, outputs - From b29a0cd8f757c7a457ce79040785513c1237f719 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 28 Jul 2016 10:15:08 -0600 Subject: [PATCH 37/37] Started preparing visuals. --- cortex/datasets/basic/voc.py | 8 +++++++- cortex/demos/demos_basic/pyramid_voc.py | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 0851eac..b234a71 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -7,9 +7,9 @@ import logging from ...utils.tools import resolve_path from PIL import Image -import PIL import random import numpy as np +import matplotlib.pyplot as plt class VOC(BasicDataset): @@ -211,3 +211,9 @@ def next(self): rval['label'] = np.array([b[1] for b in rval['label']]) return rval + + def save_images(self, image, ith_image, out_path=None): + + plt.matshow(image) + + plt.savefig(out_path + ith_image + '.png') diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index fcaba9d..dbeb736 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -17,6 +17,7 @@ from cortex.utils import intX, floatX, logger from cortex.datasets import resolve as resolve_dataset import theano.tensor as T +import matplotlib.pyplot as plt # Default arguments @@ -25,7 +26,7 @@ learning_rate_scheduler=None, optimizer='rmsprop', optimizer_args=dict(), - epochs=150, + epochs=15, valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] @@ -110,3 +111,10 @@ def _cost(module): results['cost'] = cost return used_inputs, results, updates, constants, outputs + + +def _vis(module, outputs, results): + out_path = module.out_path + + plt.matshow(np.zeros((10, 10))) + plt.savefig('Testp_plot.png')
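A standalone sketch of what the new save_images / _vis hooks above do (illustration only;
the output path is hypothetical, and the Agg backend mirrors what cortex/utils/training.py
already selects for headless runs):

    import matplotlib
    matplotlib.use('Agg')             # headless backend, as in training.py
    import matplotlib.pyplot as plt
    import numpy as np

    chunk = np.random.rand(25, 25)    # stand-in for one input chunk or probability map
    plt.matshow(chunk)                # render the 2D array
    plt.savefig('/tmp/voc_vis_0.png') # hypothetical output path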