From c73eec5b2cd4d1dbd82a6e2967fe708b859e279c Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 17 Jun 2016 18:08:35 -0600 Subject: [PATCH 01/37] Created europal.py data itterator. Currently needs europal data unpacked in source directory. --- cortex/datasets/basic/europarl.py | 239 ++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 cortex/datasets/basic/europarl.py diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py new file mode 100644 index 0000000..6e7fd1e --- /dev/null +++ b/cortex/datasets/basic/europarl.py @@ -0,0 +1,239 @@ +""" +Europarl dataset for machine translation. + +Currently only supports fr-en datasets. +""" + +from .. import BasicDataset, make_one_hot +import string +import numpy as np +from collections import defaultdict +from functools import partial +import logging +from guppy import hpy + + +class Europarl(BasicDataset): + """ + Europarl dataset itterator. + """ + def __init__(self, source=None, mode='train', english_to_french=True, + name='europarl', out_path=None, max_words=5000, + max_sentence=30, max_length=7000, **kwargs): + + self.logger = logging.getLogger( + '.'.join([self.__module__, self.__class__.__name__])) + self.logger.info('Loading %s from %s' % (name, source)) + + if source is None: + raise ValueError('No source file provided.') + print 'Loading {name} ({mode}) from {source}'.format( + name=name, mode=mode, source=source) + + self.masken = None + self.maskfr = None + self.masky = None + self.maskx = None + self.max_sentence = max_sentence + self.max_length = max_length + self.max_words = max_words + self.n_observations = 0 + self.english_to_french = english_to_french + X, Y = self.get_data(source) + data = {name: X, 'label': Y} + distributions = {name: 'multinomial', 'label': 'multinomial'} + + super(Europarl, self).__init__(data, distributions=distributions, + name=name, mode=mode, **kwargs) + + self.out_path = out_path + + if self.shuffle: + self.randomize() + + def slice_data(self, idx, data=None): # Function for restricting dataset in instance. + if data is None: data = self.data + for k, v in data.iteritems(): + self.data[k] = v[idx] + self.n_observations = len(idx) + self.X = data[self.name] + if self.labels in data.keys(): + self.Y = data[self.labels] + self.n = self.X.shape[0] + + def get_data(self, source): + fr = open(source + 'europarl-v7.fr-en.fr') #### NOT SURE IF SOURCING IS CORRECT + en = open(source + 'europarl-v7.fr-en.en') + + X = [] + Y = [] + fMax = 0 + eMax = 0 + self.itt_pos1 = 4 + self.itt_pos2 = 4 + self.frStringToToken = defaultdict(partial(self.count1, self.max_words, 3)) + self.enStringToToken = defaultdict(partial(self.count2, self.max_words, 3)) + special_tokens = {'': 0, '': 1, '': 2, '': 3} + + i = 0 + for eSentence, fSentence in zip(en.__iter__(), fr.__iter__()): # Itterate through file lines + if len(self.string_process(eSentence)) <= self.max_sentence\ + and len(self.string_process(fSentence)) <= self.max_sentence: + X.append([self.enStringToToken[eWord] for eWord in self.string_process(eSentence)]) # Convert to numerical + if len(X[-1]) > eMax: # Keep track of largest sentence in language. + eMax = len(X[-1]) + Y.append([self.frStringToToken[fWord] for fWord in self.string_process(fSentence)]) + if len(Y[-1]) > fMax: + fMax = len(Y[-1]) + i += 1 + if i >= self.max_length: + break + + fr.close() + en.close() + del fr + del en + + print 'Data loaded, preprocessing...' + print 'Padding data.' 
+ self.n_observations = len(X) # Update sample size + + X = map(partial(self.pad_array, length=(eMax + 2)), X) + Y = map(partial(self.pad_array, length=(fMax + 2)), Y) + + print 'Creating masks.' + self.masken = map(self.create_mask, X) + self.maskfr = map(self.create_mask, Y) + + self.masken = np.array(self.masken, dtype='float32') + self.maskfr = np.array(self.maskfr, dtype='float32') + + print 'Converting to one-hot.' + # The following couple lines are really slow to run. + X = make_one_hot(np.array(X).reshape((eMax + 2) * self.n_observations))\ + .reshape((self.n_observations, eMax + 2, max(self.enStringToToken.values()) + 1)) # Convert to one hot, (array -> vector -> one-hot -> array) + Y = make_one_hot(np.array(Y).reshape((fMax + 2) * self.n_observations))\ + .reshape((self.n_observations, fMax + 2, max(self.frStringToToken.values()) + 1)) + + print 'Converting to float32.' + # Conversion after one-hot as float32 arrays slow down one-hot conversion. + X = X.astype('float32') + Y = Y.astype('float32') + + self.frStringToToken.update(special_tokens) + self.enStringToToken.update(special_tokens) + + print 'Data prepared.' + if self.english_to_french: + self.maskx = self.masken + self.masky = self.maskfr + return X, Y + else: + self.maskx = self.maskfr + self.masky = self.masken + return Y, X + + @staticmethod + def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): + if C is None: + C = Europarl + europarl = C(batch_size=10, **kwargs) + if hasattr(europarl, 'logger'): + logger = europarl.logger + europarl.logger = None + else: + logger = logging.getLogger('.'.join([europarl.__module__, europarl.__class__.__name__])) + + if idx is None: + logger.info('Splitting dataset into ratios %r' % split) + if round(np.sum(split), 5) != 1. or len(split) != 3: + raise ValueError(split) + + if europarl.balance: + raise NotImplementedError() + else: + split_idx = [] + accum = 0 + for s in split: # Create indicies from percentage values + s_i = int(s * europarl.n_observations + accum) + split_idx.append(s_i) + accum += s_i + idx = range(europarl.n_observations) + + train_idx = idx[:split_idx[0]] + valid_idx = idx[split_idx[0]:split_idx[1]] + test_idx = idx[split_idx[1]:] + idx = [train_idx, valid_idx, test_idx] + else: + logger.info('Splitting dataset into ratios %.2f / %.2f /%.2f ' + 'using given indices' + % tuple(len(idx[i]) / float(europarl.n_observations) + for i in range(3))) + + assert len(batch_sizes) == len(idx) # Shouldn't have different number of batch sizes than datasets + + datasets = [] + modes = ['train', 'valid', 'test'] + data = europarl.data + europarl.data = dict() + for i, bs, mode in zip(idx, batch_sizes, modes): # Create correctly restricted copies of dataset + if bs is None: + dataset = None + else: + dataset = europarl.copy() + dataset.slice_data(i, data=data) + dataset.batch_size = bs + dataset.logger = logger + dataset.mode = mode + datasets.append(dataset) + + return datasets + [idx] + + table = string.maketrans('', '') # Variable for string_process() + + def string_process(self, s): # Helper method for get_data() + return s.lower().translate(self.table, string.punctuation).split() + + @staticmethod + def pad_array(arr, length): # Helper method for get_data() + return [1] + arr + [2] + ([0]*(length - len(arr) - 2)) + + @staticmethod # Helper method for creating mask array from a list. 
+ def create_mask(lst): + return np.array([int(bool(x)) for x in lst]) + + def count1(self, max_words, unknown_val): # Crappy replacement for count as itterators can not be copied. :( + if self.itt_pos1 <= max_words: + self.itt_pos1 += 1 + return self.itt_pos1 - 1 + else: + return unknown_val + + def count2(self, max_words, unknown_val): + if self.itt_pos2 <= max_words: + self.itt_pos2 += 1 + return self.itt_pos2 - 1 + else: + return unknown_val + +def count(start, max_words, unknown_val): # Helper method for defaultdict in get_data() + s = start + while True: + if s <= max_words: + yield s + s += 1 + else: + yield unknown_val + + +def main(): + #data = Europarl(source='/export/mialab/users/jjohnson/data/basic/', batch_size=10) + train, valid, test, idx = Europarl.factory(source='/export/mialab/users/jjohnson/data/basic/', + batch_sizes=[100, 100, 100], split=[0.7, 0.2, 0.1]) + h = hpy() + print h.heap() + print train.data['europarl'].shape + print valid.data['europarl'].shape + print test.data['europarl'].shape + +main() From 4a1d2c2d899a567a319b31c8c701f4d6dca49276 Mon Sep 17 00:00:00 2001 From: rdevon Date: Sat, 18 Jun 2016 13:01:48 -0600 Subject: [PATCH 02/37] updating europarl iterator --- cortex/datasets/basic/europarl.py | 380 +++++++++++++++++++----------- 1 file changed, 238 insertions(+), 142 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 6e7fd1e..7129f57 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -1,23 +1,44 @@ -""" +''' Europarl dataset for machine translation. Currently only supports fr-en datasets. -""" +''' -from .. import BasicDataset, make_one_hot -import string -import numpy as np from collections import defaultdict from functools import partial -import logging from guppy import hpy +import logging +import numpy as np +from os import path +from progressbar import ( + Bar, + Percentage, + ProgressBar, + Timer +) +import string + +from ..datasets import BasicDataset, make_one_hot +from ...utils import floatX, intX +np.set_printoptions(threshold=np.nan) +logger = logging.getLogger(__name__) + class Europarl(BasicDataset): - """ - Europarl dataset itterator. - """ - def __init__(self, source=None, mode='train', english_to_french=True, + '''Europarl dataset itterator. + + Attributes: + max_sentence (int): maximimum sentence length. 
+ + ''' + _PAD = 0 + _BEG = 1 + _END = 2 + _UNK = 3 + table = string.maketrans('', '') + + def __init__(self, source=None, english_to_french=True, name='europarl', out_path=None, max_words=5000, max_sentence=30, max_length=7000, **kwargs): @@ -27,31 +48,34 @@ def __init__(self, source=None, mode='train', english_to_french=True, if source is None: raise ValueError('No source file provided.') - print 'Loading {name} ({mode}) from {source}'.format( - name=name, mode=mode, source=source) - self.masken = None - self.maskfr = None - self.masky = None - self.maskx = None self.max_sentence = max_sentence self.max_length = max_length self.max_words = max_words - self.n_observations = 0 self.english_to_french = english_to_french - X, Y = self.get_data(source) - data = {name: X, 'label': Y} - distributions = {name: 'multinomial', 'label': 'multinomial'} + + X, Y, Mx, My = self.get_data(source) + data = {name: X, + 'label': Y, + 'mask_in': Mx, + 'mask_out': My} + distributions = {name: 'multinomial', + 'label': 'multinomial', + 'mask_in': None, + 'mask_out': None} super(Europarl, self).__init__(data, distributions=distributions, - name=name, mode=mode, **kwargs) + name=name, one_hot=False, **kwargs) self.out_path = out_path if self.shuffle: self.randomize() - def slice_data(self, idx, data=None): # Function for restricting dataset in instance. + def slice_data(self, idx, data=None): + '''Function for restricting dataset in instance. + + ''' if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] @@ -62,76 +86,181 @@ def slice_data(self, idx, data=None): # Function for restricting dataset in ins self.n = self.X.shape[0] def get_data(self, source): - fr = open(source + 'europarl-v7.fr-en.fr') #### NOT SURE IF SOURCING IS CORRECT - en = open(source + 'europarl-v7.fr-en.en') - - X = [] - Y = [] - fMax = 0 - eMax = 0 - self.itt_pos1 = 4 - self.itt_pos2 = 4 - self.frStringToToken = defaultdict(partial(self.count1, self.max_words, 3)) - self.enStringToToken = defaultdict(partial(self.count2, self.max_words, 3)) - special_tokens = {'': 0, '': 1, '': 2, '': 3} - - i = 0 - for eSentence, fSentence in zip(en.__iter__(), fr.__iter__()): # Itterate through file lines - if len(self.string_process(eSentence)) <= self.max_sentence\ - and len(self.string_process(fSentence)) <= self.max_sentence: - X.append([self.enStringToToken[eWord] for eWord in self.string_process(eSentence)]) # Convert to numerical - if len(X[-1]) > eMax: # Keep track of largest sentence in language. - eMax = len(X[-1]) - Y.append([self.frStringToToken[fWord] for fWord in self.string_process(fSentence)]) - if len(Y[-1]) > fMax: - fMax = len(Y[-1]) - i += 1 - if i >= self.max_length: - break - - fr.close() - en.close() - del fr - del en - - print 'Data loaded, preprocessing...' - print 'Padding data.' - self.n_observations = len(X) # Update sample size - - X = map(partial(self.pad_array, length=(eMax + 2)), X) - Y = map(partial(self.pad_array, length=(fMax + 2)), Y) - - print 'Creating masks.' - self.masken = map(self.create_mask, X) - self.maskfr = map(self.create_mask, Y) - - self.masken = np.array(self.masken, dtype='float32') - self.maskfr = np.array(self.maskfr, dtype='float32') - - print 'Converting to one-hot.' - # The following couple lines are really slow to run. 
- X = make_one_hot(np.array(X).reshape((eMax + 2) * self.n_observations))\ - .reshape((self.n_observations, eMax + 2, max(self.enStringToToken.values()) + 1)) # Convert to one hot, (array -> vector -> one-hot -> array) - Y = make_one_hot(np.array(Y).reshape((fMax + 2) * self.n_observations))\ - .reshape((self.n_observations, fMax + 2, max(self.frStringToToken.values()) + 1)) - - print 'Converting to float32.' - # Conversion after one-hot as float32 arrays slow down one-hot conversion. - X = X.astype('float32') - Y = Y.astype('float32') - - self.frStringToToken.update(special_tokens) - self.enStringToToken.update(special_tokens) - - print 'Data prepared.' + special_tokens = { + '': self._PAD, '': self._BEG, + '': self._END, '': self._UNK} + + def preprocess(s): + '''Preprocesses string. + + Args: + s (str): string to be preprocessed. + + Returns: + str: preprocessed string. + + ''' + return s.lower().translate(self.table, string.punctuation).split() + + def make_dictionary(sentences, n_lines, max_words=None): + '''Forms a dictionary from words in sentences. + + If there are more words than max_words, use the top frequent ones. + + Args: + sentences (file Handle) + n_lines (int): number of lines in file. + max_words (Optional[int]): maximum number of words. Default + is self.max_words. + + Returns: + dict: word string to token dictionary. + int: maximum length sentence. + + ''' + self.logger.info('Forming dictionary') + if max_words is None: max_words = self.max_words + + count_dict = defaultdict(int) + + widgets = ['Counting words' , ' (', Timer(), ') [', Percentage(), ']'] + pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() + + max_len = 0 + for i, sentence in enumerate(sentences): + ps = preprocess(sentence) + l = len(ps) + if l <= self.max_sentence: + for word in ps: + count_dict[word] += 1 + max_len = max(l, max_len) + pbar.update(i) + + count_keys_sorted = sorted( + count_dict, key=count_dict.get, reverse=True) + vals_sorted = sorted(count_dict.values(), reverse=True) + keys = count_keys_sorted[:max_words] + omit_freq = sum(vals_sorted[max_words:]) / float(sum(vals_sorted)) + self.logger.info('Setting %d words as with total frequency ' + '%.3g.' + % (max(0, len(count_keys_sorted) - max_words), + omit_freq)) + values = range(4, len(keys) + 4) + + d = dict() + d.update(**special_tokens) + d.update(**dict(zip(keys, values))) + return d, max_len + + def tokenize(sentence, d, pad_length): + '''Tokenize sentence using dictionary. + + Args: + sentence (str): sentence to be tokenized. + d (dict): token dictionary. + pad_length (int): total length up to pad. + + Returns: + list: tokenized sentence as list. + + ''' + ps = preprocess(sentence) + if len(ps) > self.max_sentence: + return [] + s = [self._BEG] + [d.get(w, self._UNK) for w in ps] + [self._END] + s += [self._PAD] * max(0, pad_length + 2 - len(s)) + return s + + def read_and_tokenize(file_path): + '''Read and tokenize a file of sentences. + + Args: + file_path (str): path to file. + + Returns: + list: list of tokenized sentences. + dict: token disctionary. + dict: reverse dictionary. 
+ + ''' + self.logger.info('Reading sentences from %s' % file_path) + with open(file_path) as f: + n_lines = sum(1 for line in f) + f.seek(0) + d, max_len = make_dictionary(f, n_lines) + r_d = dict((v, k) for k, v in d.iteritems()) + tokenized_sentences = [] + + f.seek(0) + self.logger.info('Tokenizing sentences from %s' % file_path) + widgets = ['Tokenizing sentences' , + ' (', Timer(), ') [', Percentage(), ']'] + pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() + for i, sentence in enumerate(f): + ts = tokenize(sentence, d, max_len) + assert len(ts) <= self.max_sentence + 2, (ts, len(ts)) + tokenized_sentences.append(ts) + pbar.update(i) + return tokenized_sentences, d, r_d + + def match_and_trim(sentences_a, sentences_b): + '''Matches 2 lists of sentences and removes incomplete pairs. + + If one of the pairs is `[]`, remove pair. + + Args: + sentences_a (list). + sentences_b (list). + + Returns: + list: new sentences_a + list: new sentences_b + + ''' + self.logger.info('Matching datasets and trimming') + if len(sentences_a) != len(sentences_b): + raise TypeError('Sentence lists are different lengths.') + + sentences_a_tr = [] + sentences_b_tr = [] + widgets = ['Matching sentences' , + ' (', Timer(), ') [', Percentage(), ']'] + trimmed = 0 + pbar = ProgressBar(widgets=widgets, maxval=len(sentences_a)).start() + for i, (s_a, s_b) in enumerate(zip(sentences_a, sentences_b)): + if len(s_a) > 0 and len(s_b) > 0: + sentences_a_tr.append(s_a) + sentences_b_tr.append(s_b) + else: + trimmed += 1 + pbar.update(i) + self.logger.debug('Trimmed %d sentences' % trimmed) + + return sentences_a_tr, sentences_b_tr + + fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.fr'))) + + en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.en'))) + + fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) + if self.english_to_french: - self.maskx = self.masken - self.masky = self.maskfr - return X, Y + X = np.array(en_sentences).astype(intX) + Y = np.array(fr_sentences).astype(intX) else: - self.maskx = self.maskfr - self.masky = self.masken - return Y, X + X = np.array(fr_sentences).astype(intX) + Y = np.array(en_sentences).astype(intX) + + self.nX_tokens = len(np.unique(X).tolist()) + self.nY_tokens = len(np.unique(Y).tolist()) + + self.logger.info('Creating masks') + Mx = (X != 0).astype(intX) + My = (Y != 0).astype(intX) + + return X, Y, Mx, My @staticmethod def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): @@ -141,8 +270,6 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): if hasattr(europarl, 'logger'): logger = europarl.logger europarl.logger = None - else: - logger = logging.getLogger('.'.join([europarl.__module__, europarl.__class__.__name__])) if idx is None: logger.info('Splitting dataset into ratios %r' % split) @@ -155,10 +282,10 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): split_idx = [] accum = 0 for s in split: # Create indicies from percentage values - s_i = int(s * europarl.n_observations + accum) + s_i = int(s * europarl.n + accum) split_idx.append(s_i) accum += s_i - idx = range(europarl.n_observations) + idx = range(europarl.n) train_idx = idx[:split_idx[0]] valid_idx = idx[split_idx[0]:split_idx[1]] @@ -167,16 +294,18 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): else: logger.info('Splitting dataset into ratios %.2f / %.2f /%.2f ' 'using given 
indices' - % tuple(len(idx[i]) / float(europarl.n_observations) + % tuple(len(idx[i]) / float(europarl.n) for i in range(3))) - assert len(batch_sizes) == len(idx) # Shouldn't have different number of batch sizes than datasets + # Shouldn't have different number of batch sizes than datasets + assert len(batch_sizes) == len(idx) datasets = [] modes = ['train', 'valid', 'test'] data = europarl.data europarl.data = dict() - for i, bs, mode in zip(idx, batch_sizes, modes): # Create correctly restricted copies of dataset + # Create correctly restricted copies of dataset + for i, bs, mode in zip(idx, batch_sizes, modes): if bs is None: dataset = None else: @@ -189,51 +318,18 @@ def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): return datasets + [idx] - table = string.maketrans('', '') # Variable for string_process() - - def string_process(self, s): # Helper method for get_data() - return s.lower().translate(self.table, string.punctuation).split() - - @staticmethod - def pad_array(arr, length): # Helper method for get_data() - return [1] + arr + [2] + ([0]*(length - len(arr) - 2)) - - @staticmethod # Helper method for creating mask array from a list. - def create_mask(lst): - return np.array([int(bool(x)) for x in lst]) - - def count1(self, max_words, unknown_val): # Crappy replacement for count as itterators can not be copied. :( - if self.itt_pos1 <= max_words: - self.itt_pos1 += 1 - return self.itt_pos1 - 1 - else: - return unknown_val - - def count2(self, max_words, unknown_val): - if self.itt_pos2 <= max_words: - self.itt_pos2 += 1 - return self.itt_pos2 - 1 - else: - return unknown_val - -def count(start, max_words, unknown_val): # Helper method for defaultdict in get_data() - s = start - while True: - if s <= max_words: - yield s - s += 1 - else: - yield unknown_val + def next(self, batch_size=None): + rval = super(Europarl, self).next(batch_size=batch_size) + rval[self.name] = make_one_hot(rval[self.name], + n_classes=self.nX_tokens) + rval['label'] = make_one_hot(rval['label'], + n_classes=self.nY_tokens) + return rval + def save_images(self, out_file=None): + '''Shows tokenized in terms of original words. -def main(): - #data = Europarl(source='/export/mialab/users/jjohnson/data/basic/', batch_size=10) - train, valid, test, idx = Europarl.factory(source='/export/mialab/users/jjohnson/data/basic/', - batch_sizes=[100, 100, 100], split=[0.7, 0.2, 0.1]) - h = hpy() - print h.heap() - print train.data['europarl'].shape - print valid.data['europarl'].shape - print test.data['europarl'].shape + Uses reverse dictionary. -main() + ''' + raise NotImplementedError() From 21542d7f687a8ef8f97ec65faf8557351e463ad3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 12:51:20 -0600 Subject: [PATCH 03/37] Created test for europarl.py. Checks shapes and masks. 
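The mask assertions in this test rest on the convention from europarl.py that the pad token is 0 and masks are built as (tokens != 0). A minimal sketch of that invariant, outside the patch and with made-up token values, just to show what "masks line up" means here:

    import numpy as np

    # hypothetical tokenized batch: <beg> w w <end> <pad> <pad>
    X = np.array([[1, 12, 47, 2, 0, 0],
                  [1,  5,  2, 0, 0, 0]])
    M = (X != 0).astype('int64')  # same construction as get_data()
    # the mask is 1 exactly where the token is not <pad>, which is the
    # element-wise check test_europarl performs on mask_in / mask_out
    assert np.array_equal(M, [[1, 1, 1, 1, 0, 0],
                              [1, 1, 1, 0, 0, 0]])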
--- cortex/datasets/basic/tests/__init__.py | 0 cortex/datasets/basic/tests/test_europarl.py | 24 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 cortex/datasets/basic/tests/__init__.py create mode 100644 cortex/datasets/basic/tests/test_europarl.py diff --git a/cortex/datasets/basic/tests/__init__.py b/cortex/datasets/basic/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py new file mode 100644 index 0000000..df54bba --- /dev/null +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -0,0 +1,24 @@ +""" +Tests europarl.py, try with nosetests test_europarl.py + +Checks that the shapes split properly and that the masks line up. +""" + +from cortex.datasets.basic.europarl import Europarl + + +def test_europarl(split=[0.7, 0.2, 0.1]): + train, valid, test, idx = Europarl.factory(split=split, batch_sizes=[10, 10, 10], + debug=True, source='/export/mialab/users/jjohnson/data/basic') + + for i, dataset in enumerate([train, valid, test]): + + for key in ['europarl', 'mask_in', 'label', 'mask_out']: # Test shapes. + assert dataset.data[key].shape == (int(idx[2][-1] * split[i]) + 1, 32) + + for k in idx[i]: # Test masks. + relative_k = k - idx[i][0] + for j in range(0, len(dataset.data['europarl'][relative_k])): + assert bool(dataset.data['europarl'][relative_k][j]) == bool(dataset.data['mask_in'][relative_k][j]) + for j in range(0, len(dataset.data['label'][relative_k])): + assert bool(dataset.data['label'][relative_k][j]) == bool(dataset.data['mask_out'][relative_k][j]) From 665283ec30e0fb6e23bfb9f31fe7f3d8622e78b4 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 13:01:29 -0600 Subject: [PATCH 04/37] Added support for max_length, added some documentation, removed some unnecessary code in split_data. --- cortex/datasets/basic/europarl.py | 67 +++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 7129f57..7a7e478 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -18,18 +18,23 @@ ) import string -from ..datasets import BasicDataset, make_one_hot +from .. import BasicDataset, make_one_hot from ...utils import floatX, intX np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) + class Europarl(BasicDataset): '''Europarl dataset itterator. Attributes: - max_sentence (int): maximimum sentence length. + max_sentence (int): Maximum sentence length. + max_length (int): Maximum number of sentences. + max_words (int): Maximum size of vocabulary. + english_to_french (bool): If true English is under name key, and French under label key, else reversed. + debug (bool): If true restricts max_length to 1000. ''' _PAD = 0 @@ -40,7 +45,19 @@ class Europarl(BasicDataset): def __init__(self, source=None, english_to_french=True, name='europarl', out_path=None, max_words=5000, - max_sentence=30, max_length=7000, **kwargs): + max_sentence=30, max_length=7000, debug=False, **kwargs): + """ + Args: + source (str): Path to where the europarl data is stored. + english_to_french (bool): True for English input French labels, False for reverse. + name (str): Name of dataset. + out_path (str): Path to save outs. + max_words (int): Maximum vocab size, extra words are marked unknown. + max_sentence (int): Maximum sentence length, longer sentences are ignored. 
+ max_length (int): Maximum number of sentences. + debug (bool): If True restricts max_length to 1000. + **kwargs: + """ self.logger = logging.getLogger( '.'.join([self.__module__, self.__class__.__name__])) @@ -54,6 +71,9 @@ def __init__(self, source=None, english_to_french=True, self.max_words = max_words self.english_to_french = english_to_french + if debug: + self.max_length = 1000 + X, Y, Mx, My = self.get_data(source) data = {name: X, 'label': Y, @@ -75,15 +95,14 @@ def __init__(self, source=None, english_to_french=True, def slice_data(self, idx, data=None): '''Function for restricting dataset in instance. + Args: + idx (list): Indices of data to be kept. + data (dict): Data to be sliced and kept. + ''' if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] - self.n_observations = len(idx) - self.X = data[self.name] - if self.labels in data.keys(): - self.Y = data[self.labels] - self.n = self.X.shape[0] def get_data(self, source): special_tokens = { @@ -123,11 +142,11 @@ def make_dictionary(sentences, n_lines, max_words=None): count_dict = defaultdict(int) - widgets = ['Counting words' , ' (', Timer(), ') [', Percentage(), ']'] + widgets = ['Counting words', ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() max_len = 0 - for i, sentence in enumerate(sentences): + for i, sentence in zip(range(0, n_lines), sentences): ps = preprocess(sentence) l = len(ps) if l <= self.max_sentence: @@ -155,6 +174,8 @@ def make_dictionary(sentences, n_lines, max_words=None): def tokenize(sentence, d, pad_length): '''Tokenize sentence using dictionary. + If sentence is longer than max_sentence, returns []. + Args: sentence (str): sentence to be tokenized. d (dict): token dictionary. @@ -171,11 +192,12 @@ def tokenize(sentence, d, pad_length): s += [self._PAD] * max(0, pad_length + 2 - len(s)) return s - def read_and_tokenize(file_path): + def read_and_tokenize(file_path, max_length): '''Read and tokenize a file of sentences. Args: file_path (str): path to file. + max_length (int): maximum number of lines to read. Returns: list: list of tokenized sentences. 
@@ -185,7 +207,7 @@ def read_and_tokenize(file_path): ''' self.logger.info('Reading sentences from %s' % file_path) with open(file_path) as f: - n_lines = sum(1 for line in f) + n_lines = min(sum(1 for line in f), max_length) f.seek(0) d, max_len = make_dictionary(f, n_lines) r_d = dict((v, k) for k, v in d.iteritems()) @@ -196,7 +218,7 @@ def read_and_tokenize(file_path): widgets = ['Tokenizing sentences' , ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() - for i, sentence in enumerate(f): + for i, sentence in zip(range(0, n_lines), f): ts = tokenize(sentence, d, max_len) assert len(ts) <= self.max_sentence + 2, (ts, len(ts)) tokenized_sentences.append(ts) @@ -223,7 +245,7 @@ def match_and_trim(sentences_a, sentences_b): sentences_a_tr = [] sentences_b_tr = [] - widgets = ['Matching sentences' , + widgets = ['Matching sentences', ' (', Timer(), ') [', Percentage(), ']'] trimmed = 0 pbar = ProgressBar(widgets=widgets, maxval=len(sentences_a)).start() @@ -239,10 +261,10 @@ def match_and_trim(sentences_a, sentences_b): return sentences_a_tr, sentences_b_tr fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.fr'))) + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.en'))) + path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length) fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) @@ -264,6 +286,19 @@ def match_and_trim(sentences_a, sentences_b): @staticmethod def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): + ''' + + Args: + C: Data iterator to use, defaults to Europarl. + split: List of percentage values for train, valid, and test datasets respectively. + idx: List of indices for train, valid and test datasets respectively. + batch_sizes: List of batch sizes for train, valid, and test datasets respectively. + **kwargs: Other arguments to be passed to the data iterator. + + Returns: Train, valid, test,(datasets) indices(list of indices for data of each). + + ''' + if C is None: C = Europarl europarl = C(batch_size=10, **kwargs) From e8cfe60153ed71d8b0755ea5decc1db4a1fd03a0 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 14:24:20 -0600 Subject: [PATCH 05/37] Fixed bug where words that only appeared in sentences that are too long would cause make_one_hot to crash. Added batch size and iteration testing to test_europarl.py. --- cortex/datasets/__init__.py | 4 ++-- cortex/datasets/basic/europarl.py | 10 +++++++--- cortex/datasets/basic/tests/test_europarl.py | 7 +++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index f7efe01..b816a5e 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -70,7 +70,7 @@ def make_one_hot(Y, n_classes=None): class_list = np.unique(Y).tolist() n_classes = len(class_list) else: - class_list = range(n_classes) + class_list = range(0, n_classes) if Y.ndim == 2: reshape = Y.shape @@ -86,7 +86,7 @@ def make_one_hot(Y, n_classes=None): i = class_list.index(Y[idx]) except ValueError: raise ValueError('Class list is missing elements') - O[idx, i] = 1.; + O[idx, i] = 1. 
if reshape is not None: O = O.reshape(reshape + (n_classes,)) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index 7a7e478..a94e7ab 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -103,6 +103,7 @@ def slice_data(self, idx, data=None): if data is None: data = self.data for k, v in data.iteritems(): self.data[k] = v[idx] + self.n = len(idx) def get_data(self, source): special_tokens = { @@ -215,7 +216,7 @@ def read_and_tokenize(file_path, max_length): f.seek(0) self.logger.info('Tokenizing sentences from %s' % file_path) - widgets = ['Tokenizing sentences' , + widgets = ['Tokenizing sentences', ' (', Timer(), ') [', Percentage(), ']'] pbar = ProgressBar(widgets=widgets, maxval=n_lines).start() for i, sentence in zip(range(0, n_lines), f): @@ -275,8 +276,11 @@ def match_and_trim(sentences_a, sentences_b): X = np.array(fr_sentences).astype(intX) Y = np.array(en_sentences).astype(intX) - self.nX_tokens = len(np.unique(X).tolist()) - self.nY_tokens = len(np.unique(Y).tolist()) + self.nX_tokens = X.max() + 1 + self.nY_tokens = Y.max() + 1 + + #self.nX_tokens = len(np.unique(X).tolist()) + #self.nY_tokens = len(np.unique(Y).tolist()) self.logger.info('Creating masks') Mx = (X != 0).astype(intX) diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py index df54bba..edecfe1 100644 --- a/cortex/datasets/basic/tests/test_europarl.py +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -7,8 +7,8 @@ from cortex.datasets.basic.europarl import Europarl -def test_europarl(split=[0.7, 0.2, 0.1]): - train, valid, test, idx = Europarl.factory(split=split, batch_sizes=[10, 10, 10], +def test_europarl(split=[0.7, 0.2, 0.1], batch_sizes=[10, 10, 10]): + train, valid, test, idx = Europarl.factory(split=split, batch_sizes=batch_sizes, debug=True, source='/export/mialab/users/jjohnson/data/basic') for i, dataset in enumerate([train, valid, test]): @@ -16,6 +16,9 @@ def test_europarl(split=[0.7, 0.2, 0.1]): for key in ['europarl', 'mask_in', 'label', 'mask_out']: # Test shapes. assert dataset.data[key].shape == (int(idx[2][-1] * split[i]) + 1, 32) + for data in dataset: # Test batch sizes. + assert len(data[key]) == batch_sizes[i] + for k in idx[i]: # Test masks. relative_k = k - idx[i][0] for j in range(0, len(dataset.data['europarl'][relative_k])): From fdbb43832736078d1594181458ba3f939339dbd3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Sun, 19 Jun 2016 15:33:47 -0600 Subject: [PATCH 06/37] Fixed path sourcing for data loading, and added europarl to supported data. --- cortex/datasets/__init__.py | 4 +++- cortex/datasets/basic/europarl.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index b816a5e..9aa6ea5 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -42,12 +42,14 @@ def resolve(c): from .basic.caltech import CALTECH from .basic.uci import UCI from .basic.cifar import CIFAR + from .basic.europarl import Europarl r_dict = { 'mnist': MNIST, 'cifar': CIFAR, 'caltech': CALTECH, - 'uci': UCI + 'uci': UCI, + 'europarl': Europarl } C = r_dict.get(c, None) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index a94e7ab..c7e5773 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -20,7 +20,7 @@ from .. 
import BasicDataset, make_one_hot from ...utils import floatX, intX - +from ...utils.tools import resolve_path np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) @@ -65,6 +65,7 @@ def __init__(self, source=None, english_to_french=True, if source is None: raise ValueError('No source file provided.') + source = resolve_path(source) self.max_sentence = max_sentence self.max_length = max_length From 7e72540985a90f831225e458ac3f6e7b8a3450f0 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 08:29:27 -0600 Subject: [PATCH 07/37] Added __init__ to demos_basic/tests to make imports for tests work better. --- cortex/demos/demos_basic/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cortex/demos/demos_basic/tests/__init__.py diff --git a/cortex/demos/demos_basic/tests/__init__.py b/cortex/demos/demos_basic/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 5ef70a1385903d1b5950118abc019840ce3bd090 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 08:29:58 -0600 Subject: [PATCH 08/37] Start of an RNN demo. --- cortex/demos/demos_basic/rnn_europarl.py | 93 ++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 cortex/demos/demos_basic/rnn_europarl.py diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py new file mode 100644 index 0000000..e11adc8 --- /dev/null +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -0,0 +1,93 @@ +""" +Demo for next word guessing using an RNN. + +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +from cortex.models.rnn import SimpleRNN +from cortex.datasets import resolve as resolve_dataset + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='sgd', + optimizer_args=dict(), + epochs=100, + valid_key='error', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=100, + valid_batch_size=100, + dataset='europarl', + distribution='multinomial', + source='$data/basic/' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, +) + +simple_rnn_args = dict( + dim_h=100, + input_layer='europarl', + output='label', + dropout=None +) + +extra_arg_keys = ['simple_rnn_args'] + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + simple_rnn_args = module.simple_rnn_args + dim_in = dataset.dims[dataset.name] + dim_out = dataset.dims['label'] + distribution = dataset.distributions[simple_rnn_args['output']] + + model = SimpleRNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, **simple_rnn_args) + + models['rnn'] = model + return models + + +def _cost(module): + models = module.models + + X = module.inputs[module.dataset.name] + used_inputs = [module.dataset.name] + + model = models['rnn'] + outputs = model(X) + + results = OrderedDict() + p = outputs[0]['p'] + base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(axis=0) + print base_cost, '####' + print model.neg_log_prob(X[1:], p[:-1]), '#####' + cost = base_cost + + updates = theano.OrderedUpdates() + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] 
= cost + + return used_inputs, results, updates, constants, outputs[0] From 38dd209cea1a4f12b3eef6f9428cdb6a6d522744 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:53:18 -0600 Subject: [PATCH 09/37] Added field (self.dimall) for holding all dimensions of data. (useful if shuffleing order of indicies) --- cortex/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index 9aa6ea5..acb6749 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -369,6 +369,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, self.balance = balance self.dims = dict() + self.dimsall = dict() if distributions is None: self.distributions = dict() else: @@ -393,6 +394,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, 'number of samples (shape[0]), ' '(%d vs %d)' % (self.n, v.shape[0])) self.dims[k] = v.shape[1] + self.dimsall[k] = v.shape if not k in self.distributions.keys(): self.distributions[k] = 'binomial' From 65b767a5292f8c361f6a58826b9ef5240a805909 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:54:25 -0600 Subject: [PATCH 10/37] Fixed source path. --- cortex/datasets/basic/tests/test_europarl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py index edecfe1..ba13101 100644 --- a/cortex/datasets/basic/tests/test_europarl.py +++ b/cortex/datasets/basic/tests/test_europarl.py @@ -9,7 +9,7 @@ def test_europarl(split=[0.7, 0.2, 0.1], batch_sizes=[10, 10, 10]): train, valid, test, idx = Europarl.factory(split=split, batch_sizes=batch_sizes, - debug=True, source='/export/mialab/users/jjohnson/data/basic') + debug=True, source='/export/mialab/users/jjohnson/data/basic/europarl') for i, dataset in enumerate([train, valid, test]): From c796f5a4745111787fa0f26aff67072d04cbb798 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:56:05 -0600 Subject: [PATCH 11/37] Added functionality for not counting words that appear in corrospondence with long sentences of the other language. 
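In other words, the dictionary is now counted only over sentence pairs that will survive match_and_trim: a word that occurs only alongside an over-long sentence in the other language belongs to a pair that gets dropped anyway, and giving it a token id would leave ids in the vocabulary that never appear in the final arrays (the situation that previously broke make_one_hot). A toy sketch of the idea, with made-up sentences and a made-up limit, not the actual find_long_sentences code:

    from collections import defaultdict

    max_sentence = 3
    en_sents = [['the', 'cat'], ['a', 'very', 'long', 'sentence']]
    fr_sents = [['le', 'chat'], ['une', 'phrase']]

    # a pair is dropped if either side is too long, so words are only
    # counted from pairs that will actually be kept
    keep = [i for i, (e, f) in enumerate(zip(en_sents, fr_sents))
            if len(e) <= max_sentence and len(f) <= max_sentence]
    counts = defaultdict(int)
    for i in keep:
        for w in en_sents[i]:
            counts[w] += 1
    assert 'very' not in counts  # occurs only in the dropped pair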
--- cortex/datasets/basic/europarl.py | 62 ++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py index c7e5773..5e66197 100644 --- a/cortex/datasets/basic/europarl.py +++ b/cortex/datasets/basic/europarl.py @@ -88,6 +88,9 @@ def __init__(self, source=None, english_to_french=True, super(Europarl, self).__init__(data, distributions=distributions, name=name, one_hot=False, **kwargs) + self.dimsall['europarl'] = self.dimsall['europarl'][0], self.dimsall['europarl'][1], self.nX_tokens + self.dimsall['label'] = self.dimsall['label'][0], self.dimsall['label'][1], self.nY_tokens + self.out_path = out_path if self.shuffle: @@ -123,7 +126,18 @@ def preprocess(s): ''' return s.lower().translate(self.table, string.punctuation).split() - def make_dictionary(sentences, n_lines, max_words=None): + def find_long_sentences(epath, fpath, n_lines): + with open(epath) as e: + with open(fpath) as f: + e.seek(0) + f.seek(0) + too_long_indices = [] + for i, eSentence, fSentence in zip(range(0, n_lines), e, f): + if len(preprocess(eSentence)) > self.max_sentence or len(preprocess(fSentence)) > self.max_sentence: + too_long_indices.append(i) + return too_long_indices + + def make_dictionary(sentences, n_lines, max_words=None, too_long_indices=None): '''Forms a dictionary from words in sentences. If there are more words than max_words, use the top frequent ones. @@ -141,6 +155,7 @@ def make_dictionary(sentences, n_lines, max_words=None): ''' self.logger.info('Forming dictionary') if max_words is None: max_words = self.max_words + if too_long_indices is None: too_long_indices = [] count_dict = defaultdict(int) @@ -151,7 +166,7 @@ def make_dictionary(sentences, n_lines, max_words=None): for i, sentence in zip(range(0, n_lines), sentences): ps = preprocess(sentence) l = len(ps) - if l <= self.max_sentence: + if l <= self.max_sentence and i not in too_long_indices: for word in ps: count_dict[word] += 1 max_len = max(l, max_len) @@ -194,7 +209,7 @@ def tokenize(sentence, d, pad_length): s += [self._PAD] * max(0, pad_length + 2 - len(s)) return s - def read_and_tokenize(file_path, max_length): + def read_and_tokenize(file_path, max_length, too_long_indices=None): '''Read and tokenize a file of sentences. 
Args: @@ -211,7 +226,7 @@ def read_and_tokenize(file_path, max_length): with open(file_path) as f: n_lines = min(sum(1 for line in f), max_length) f.seek(0) - d, max_len = make_dictionary(f, n_lines) + d, max_len = make_dictionary(f, n_lines, too_long_indices=too_long_indices) r_d = dict((v, k) for k, v in d.iteritems()) tokenized_sentences = [] @@ -262,11 +277,14 @@ def match_and_trim(sentences_a, sentences_b): return sentences_a_tr, sentences_b_tr + too_long_indices = find_long_sentences(path.join(path.join(source, 'europarl-v7.fr-en.en')), + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) + fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length, too_long_indices=too_long_indices) en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( - path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length) + path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length, too_long_indices=too_long_indices) fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) @@ -277,11 +295,8 @@ def match_and_trim(sentences_a, sentences_b): X = np.array(fr_sentences).astype(intX) Y = np.array(en_sentences).astype(intX) - self.nX_tokens = X.max() + 1 - self.nY_tokens = Y.max() + 1 - - #self.nX_tokens = len(np.unique(X).tolist()) - #self.nY_tokens = len(np.unique(Y).tolist()) + self.nX_tokens = len(np.unique(X).tolist()) + int(3 not in np.unique(X).tolist()) + self.nY_tokens = len(np.unique(Y).tolist()) + int(3 not in np.unique(Y).tolist()) self.logger.info('Creating masks') Mx = (X != 0).astype(intX) @@ -366,10 +381,31 @@ def next(self, batch_size=None): n_classes=self.nY_tokens) return rval - def save_images(self, out_file=None): + def save_images(self, image, english=True, out_path=None): '''Shows tokenized in terms of original words. Uses reverse dictionary. ''' - raise NotImplementedError() + print image.shape + sentences = [] + for sentence in image: + sen = [] + for token in sentence: + + if english: + sen.append(self.en_dict_r[token.argmax()]) + else: + sen.append(self.fr_dict_r[token.argmax]) + sentences.append(sen) + + if out_path is None: + for sentence in sentences: + print ' '.join(sentence) + '.' + + else: + with open(out_path, 'w') as f: + for sentence in sentences: + f.write(' '.join(sentence) + '.') + + return sentences From b84c88f114c5f26f1e59e1e2712c4e78c6358e8b Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Mon, 20 Jun 2016 16:56:45 -0600 Subject: [PATCH 12/37] Added _vis(). Appears to be functional. 
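save_images turns one-hot model output back into words by taking an argmax per step and looking the index up in the reverse dictionary. A minimal sketch of that decode step, with a tiny made-up vocabulary (the placeholder token strings are not the real dictionary entries):

    import numpy as np

    dict_r = {0: '<pad>', 1: '<beg>', 2: '<end>', 4: 'hello'}
    # one sampled sentence: each row is a one-hot / softmax vector over tokens
    sentence = np.array([[0., 1., 0., 0., 0.],
                         [0., 0., 0., 0., 1.],
                         [0., 0., 1., 0., 0.]])
    words = [dict_r[step.argmax()] for step in sentence]
    assert words == ['<beg>', 'hello', '<end>']

Note that the lookup needs the result of argmax(), i.e. the index, not the bound method itself.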
--- cortex/demos/demos_basic/rnn_europarl.py | 59 +++++++++++++++++------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py index e11adc8..cabfa3e 100644 --- a/cortex/demos/demos_basic/rnn_europarl.py +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -6,7 +6,9 @@ from collections import OrderedDict import theano +import numpy as np from cortex.models.rnn import SimpleRNN +from cortex.utils import intX, floatX from cortex.datasets import resolve as resolve_dataset @@ -14,20 +16,21 @@ _learning_args = dict( learning_rate=0.01, learning_rate_scheduler=None, - optimizer='sgd', + optimizer='rmsprop', optimizer_args=dict(), epochs=100, - valid_key='error', + valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] ) _dataset_args = dict( - train_batch_size=100, - valid_batch_size=100, + train_batch_size=10, + valid_batch_size=10, + debug=False, dataset='europarl', distribution='multinomial', - source='$data/basic/' + source='$data/basic/europarl' ) _model_args = dict( @@ -36,7 +39,7 @@ ) simple_rnn_args = dict( - dim_h=100, + dim_h=1000, input_layer='europarl', output='label', dropout=None @@ -44,13 +47,19 @@ extra_arg_keys = ['simple_rnn_args'] +#theano.config.compute_test_value = 'warn' + +#theano.config.exception_verbosity = 'high' + +#theano.config.optimizer = 'None' + def _build(module): models = OrderedDict() dataset = module.dataset simple_rnn_args = module.simple_rnn_args - dim_in = dataset.dims[dataset.name] - dim_out = dataset.dims['label'] + dim_in = dataset.dimsall[dataset.name][2] + dim_out = dataset.dimsall[dataset.name][2] distribution = dataset.distributions[simple_rnn_args['output']] model = SimpleRNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, **simple_rnn_args) @@ -62,20 +71,17 @@ def _build(module): def _cost(module): models = module.models - X = module.inputs[module.dataset.name] + X = module.inputs[module.dataset.name].transpose(1, 0, 2) used_inputs = [module.dataset.name] model = models['rnn'] - outputs = model(X) + outputs, preact, updates = model(X) results = OrderedDict() - p = outputs[0]['p'] - base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(axis=0) - print base_cost, '####' - print model.neg_log_prob(X[1:], p[:-1]), '#####' + p = outputs['p'] + base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(0).mean() cost = base_cost - updates = theano.OrderedUpdates() constants = [] l2_decay = module.l2_decay @@ -90,4 +96,25 @@ def _cost(module): results['-sum log p(x | y)'] = base_cost results['cost'] = cost - return used_inputs, results, updates, constants, outputs[0] + return used_inputs, results, updates, constants, outputs + + +def _viz(module, outputs, results, n_samples=10, n_steps=10): + out_path = module.out_path + out_path = None #### For testing purposes + n_tokens = int(module.dataset.dimsall[module.dataset.name][2]) + + pvals = np.zeros((n_samples, n_tokens)) + 1./float(n_tokens) + x0 = module.models['rnn'].trng.multinomial(pvals=pvals, dtype=floatX) + + outputs, updates = module.models['rnn'].sample(x0=x0, n_steps=n_steps) + + updates = theano.OrderedUpdates(updates) + + f_vis = theano.function([], outputs['x'], updates=updates) + + def f_analysis(): + out = f_vis() + return module.dataset.save_images(out, out_path=out_path) + + return f_analysis From a88a6c2f983420fa2f259ae79faa07ba944144ea Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 23 Jun 2016 18:18:25 -0600 Subject: [PATCH 13/37] Added voc.py the start of a 2D classification 
dataset iterator class. --- cortex/datasets/basic/voc.py | 171 +++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 cortex/datasets/basic/voc.py diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py new file mode 100644 index 0000000..66eab2d --- /dev/null +++ b/cortex/datasets/basic/voc.py @@ -0,0 +1,171 @@ +""" +Data itteratoor for VOC classification data. +""" + +from .. import BasicDataset +from os import path +import logging +from ...utils.tools import resolve_path +from PIL import Image +import random + + +class VOC(BasicDataset): + """Dataset iterator for VOC classification data. (Designed for use with Pyramid RNNs) + + Attributes: + + """ + + def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, + mode='train', source=None, name='voc', **kwargs): + """ + + Args: + images_loaded (int): How many images to load + chunk_size (int): Dimension of chunks to be made. + mode (str): Type of data to load, train, trainval, val. + source (str): Path to directory containing VOCdevkit + name: Name of iterator + **kwargs: + """ + + self.logger = logging.getLogger( + '.'.join([self.__module__, self.__class__.__name__])) + self.logger.info('Loading %s from %s' % (name, source)) + + if source is None: + raise ValueError('No source file provided.') + source = resolve_path(source) + + self.chunks = chunks + self.images_loaded = images_loaded + if chunk_size % 2: + self.chunk_size = chunk_size + else: + self.logger.info('Using %d + 1 to get an odd chunk size.' % chunk_size) + self.chunk_size = chunk_size + 1 + + X, Y = self.get_data(source, mode) + data = {name: X, 'label': Y} + distributions = {name: 'multinomial', 'label': 'multinomial'} + + super(VOC, self).__init__(data, distributions=distributions, + name=name, **kwargs) + + self.out_path = out_path + + if self.shuffle: + self.randomize() + + def get_data(self, source, mode): + """Gets data given source, chunks it, and returns chunks with center labels. + + Args: + source (str): File path to directory containing VOCdevkit. + mode (str): Mode of data, eg. train, trainval, val. + + Returns: + + """ + rand = random.Random() + buff_dist = (self.chunk_size + 1)/2 + + def get_unique(pixels): + """Helper function for get_data, returns the number of unique classifiers in an image. + + Args: + im (list of lists): Pixel classifier values. + + Returns (int): Number of unique classifiers in image. + + """ + unique = [] + for line in pixels: + for i in line: + if i not in unique: + unique.append(i) + return len(unique) + + def image_to_pixels(im): + """ + + Args: + im (Image): Image object form PIL + + Returns (list of lists): Pixels + + """ + pixels = list(im.getdata()) + width, height, = im.size + return [pixels[i * width:(i + 1) * width] for i in xrange(height)] + + + def project_to_binary(pixels): + """Helper function for get_data, returns binary version of input pixels. + + Args: + pixels (list of lists of ints): pixels of an image. + + Returns: Pixels projected to binary. + + """ + retval = [] + for ln in pixels: + retval.append([bool(val) for val in ln]) + return retval + + def get_random_chunk(pixels_data, pixels_label): + """Helper function for get_data, gets random chunk from data, and returns label for center. + + Args: + pixels_data (list of lists): Image pixels of data. + pixels_label (list of lists): Image pixels of label. 
+ + Returns: data_chunk (list of lists subsection of pixels_data), label_val (value of label at center of chunk) + + """ + y = rand.randint(buff_dist, len(pixels_data) - buff_dist) + x = rand.randint(buff_dist, len(pixels_data[0]) - buff_dist) + data_chunk = [] + label_val = pixels_label[y][x] + for i in range(y -buff_dist + 1, y + buff_dist): + data_chunk.append(pixels_data[i][x - buff_dist + 1 : buff_dist - 1]) + assert len(data_chunk) == self.chunk_size and len(data_chunk[0]) == self.chunk_size + return data_chunk, label_val + + names = [] + with open(source + 'VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: + for line in f: + names.append(line[:-1]) + + self.data_images = [] + self.label_images = [] + images_loaded = 0 + for name in names: + if images_loaded < self.images_loaded: + label_im = Image.open(source + '/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') + label_pixels = image_to_pixels(label_im) + if get_unique(label_pixels) == 3: + self.label_images.append(project_to_binary(label_pixels)) + + data_im = Image.open(source + 'VOCdevkit/VOC2010/JPEGImages/' + name + '.jpeg').convert('1') + self.data_images.append(image_to_pixels(data_im)) + + images_loaded += 1 + else: + break + + X = [] + Y = [] + for i in xrange(0, self.chunks): + k = rand.randint(0, images_loaded - 1) + x, y = get_random_chunk(self.data_images[k], self.label_images[k]) + X.append(x) + Y.append(y) + + assert len(X) == self.chunks and len(Y) == self.chunks + + return X, Y + + From 6e11504c9ea470e4ca96bf0a141f05b8925f61f8 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 23 Jun 2016 18:19:29 -0600 Subject: [PATCH 14/37] Added pyramid_rnn model and basic tests for its construction and step function. Still a work in progress. --- cortex/models/pyramid_rnn.py | 75 +++++++++++++++++++++++++ cortex/models/tests/test_pyramid_rnn.py | 39 +++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 cortex/models/pyramid_rnn.py create mode 100644 cortex/models/tests/test_pyramid_rnn.py diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py new file mode 100644 index 0000000..85758e7 --- /dev/null +++ b/cortex/models/pyramid_rnn.py @@ -0,0 +1,75 @@ +""" +Module for 2 dimensional pyramid RNN layers. +""" + +from .rnn import RNN +import collections +import theano.tensor as T +import numpy as np +from ..utils import tools + + +class Pyramid_RNN(RNN): + + def __init__(self, dim_in, dim_hs, dim_out=None, output_net=None, + input_net=None, name='pyramid', **kwargs): + + if dim_out is None: + self.dim_out = 1 + super(Pyramid_RNN, self).__init__(dim_in=dim_in, dim_hs=dim_hs, name=name, + output_net=output_net, input_net=input_net, **kwargs) + + @staticmethod + def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): + '''Factory for creating MLPs for Pyramid_RNN and returning . + + Convenience to quickly create MLPs from dictionaries, linking all + relevant dimensions and distributions. + + Args: + dim_in (int): input dimension. + dim_hs (list): dimensions of recurrent units. + dim_out (Optional[int]): output dimension. If not provided, assumed + to be dim_in. + + Returns: + RNN + + ''' + assert len(dim_hs) > 0 + if dim_out is None: + dim_out = 1 + mlps, kwargs = RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) + kwargs.update(**mlps) + + return Pyramid_RNN(dim_in, dim_hs, dim_out=dim_out, **kwargs) + + def set_params(self): + '''Initialize RNN parameters. 
+ + ''' + self.params = collections.OrderedDict() + for i, dim_h in enumerate(self.dim_hs): + Ur = tools.norm_weight(3 * dim_h, dim_h) + self.params['Ur%d' % i] = Ur + + self.set_net_params() + + def _step(self, m, y, h_, Ur): + '''Step function for RNN call. + + Args: + m (T.tensor): masks. + y (T.tensor): inputs. + h_ (T.tensor): recurrent state. + Ur (theano.shared): recurrent connection. + + Returns: + T.tensor: next recurrent state. + + ''' + H_t = T.concatenate((h_, T.roll(h_, 1, 2), T.roll(h_, -1, 2)), 2) + preact = y + T.dot(H_t, Ur) + h = T.tanh(preact) + h = m * h + (1 - m) * h_ + return h diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py new file mode 100644 index 0000000..2573fa9 --- /dev/null +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -0,0 +1,39 @@ +''' +Module for testing 2D pyramid RNN. +''' + +from cortex.models.pyramid_rnn import Pyramid_RNN +import numpy as np +import theano +import theano.tensor as T + + +def test_build(dim_in=13, dim_h=17): + pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], + dim_out=1) + pyramid.set_tparams() + + return pyramid + + +def test_step(pyramid=None, dim_in=13, dim_h=17): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + + m = theano.tensor.tensor3() + y = theano.tensor.tensor3() + h_ = theano.tensor.tensor3() + Ur = theano.tensor.matrix() + + activation = pyramid._step(m, y, h_, Ur) + f = theano.function([m, y, h_, Ur], activation) + + + t = f(np.ones((10, dim_in, dim_h), dtype='float32'), np.ones((10, dim_in, dim_h), dtype='float32'), + np.ones((10, dim_in, dim_h), dtype='float32'), pyramid.params['Ur0']) + + preact = np.ones((10, dim_in, dim_h), dtype='float32') + \ + np.dot(np.ones((10, dim_in, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) + n = np.tanh(preact) + + np.testing.assert_almost_equal(t, n) From 8688d7512173150838b1beaee209aa10ec1be852 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:12:41 -0600 Subject: [PATCH 15/37] Working on the __call__ function. Not currently functional. --- cortex/models/pyramid_rnn.py | 117 +++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 85758e7..71949da 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -5,8 +5,11 @@ from .rnn import RNN import collections import theano.tensor as T +import theano import numpy as np from ..utils import tools +import collections as coll +from ..utils import floatX class Pyramid_RNN(RNN): @@ -73,3 +76,117 @@ def _step(self, m, y, h_, Ur): h = T.tanh(preact) h = m * h + (1 - m) * h_ return h + + def step_call(self, x, m, h0s, *params): + '''Step version of __call__ for scan + + Args: + x (T.tensor): input. + m (T.tensor): mask. + h0s (list): list of recurrent initial states. Calculated in this function now, ie NOT IMPLEMENTED + *params: list of theano.shared. + + Returns: + OrderedDict: dictionary of results. and now calculated h0s + + ''' + n_steps = (x.shape[0] + 1)/2 + n_samples = x.shape[1] + input = x + + updates = theano.OrderedUpdates() + + h0s = [] + hs = [] + directional_values = [] + for k in range(0, 4): # Iterate through directions. 
+ x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.dim_in + 1)/2].astype('float32') + h0s.append([T.alloc(0., x.shape[1], self.dim_in, dim_h).astype(floatX) for dim_h in self.dim_hs]) + for i, h0 in enumerate(h0s[k]): + seqs = [m[:, :, None]] + self.call_seqs(x, None, i, *params) + outputs_info = [h0] + non_seqs = [self.get_recurrent_args(*params)[i]] + h, updates_ = theano.scan( + self._step, + sequences=seqs, + outputs_info=outputs_info, + non_sequences=non_seqs, + name=self.name + '_recurrent_steps_%d' % i, + n_steps=n_steps) + hs.append(h) + x = h + updates += updates_ + directional_values.append(h[(self.dim_in + 1)/2]) # Remember directional outputs. + + o_params = self.get_output_args(*params) + out_net_out = self.output_net.step_call(sum(directional_values), *o_params) # Sum different directions. + preact = out_net_out['z'] + p = out_net_out['p'] + + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s + + def __call__(self, x, m=None, h0s=None, condition_on=None): + '''Call function. + + For learning RNNs. + + Args: + x (T.tensor): input sequence. window x batch x dim (a x b x a) where a is chunk size, b is batch size. + m (T.tensor): mask. window x batch. For masking in recurrent steps. NOT IMPLEMENTED + h0s (Optional[list]): initial h0s. NOT IMPLEMENTED + condition_on (Optional[T.tensor]): conditional for recurrent step. + + Returns: + OrderedDict: dictionary of results: hiddens, probabilities, and + preacts. + theano.OrderedUpdates. + + ''' + constants = [] + input_rotations = [] + + ''' Calculating h0s in step_call so that rotations of data happen once and don't need to be saved. + if h0s is None and self.init_net is not None: + h0s = self.init_net.initialize(x[0]) + constants += h0s + elif h0s is None: + h0s = [T.alloc(0., x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs] + ''' + + if m is None: + m = T.ones((x.shape[0], x.shape[1])).astype(floatX) + + params = self.get_sample_params() + + results, updates, h0s = self.step_call(x, m, h0s, *params) + results['h0s'] = h0s + return results, updates, constants + + def call_seqs(self, x, condition_on, level, *params): + '''Prepares the input for `__call__`. + + Args: + x (T.tensor): input + condtion_on (T.tensor or None): tensor to condition recurrence on. + level (int): reccurent level. + *params: list of theano.shared. + + Returns: + list: list of scan inputs. + + ''' + print x.shape + if level == 0: + i_params = self.get_input_args(*params) + a = self.input_net.step_preact(x, *i_params) + else: + i_params = self.get_inter_args(level - 1, *params) + a = self.inter_nets[level - 1].step_preact(x, *i_params) + print a.shape + print self.input_net.dim_in + print self.input_net.dim_out + + if condition_on is not None: + a += condition_on + + return [a] From 2cb1caaa08f38833f27a6fafa50f7edb7127d944 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:07 -0600 Subject: [PATCH 16/37] Test for new __call__ function added. 
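The test added below builds the graph produced by __call__ and compiles the predicted
probabilities. As a rough usage sketch (names taken from patches 14-15; this is an
illustration, not part of the diff, and assumes the graph builds cleanly):

    import numpy as np
    import theano
    from cortex.models.pyramid_rnn import Pyramid_RNN

    pyramid = Pyramid_RNN.factory(dim_in=13, dim_hs=[17], dim_out=1)
    pyramid.set_tparams()

    x = np.random.randn(13, 10, 13).astype('float32')   # window x batch x dim
    results, updates, constants = pyramid(x)             # __call__ from patch 15
    f = theano.function([], results['p'], updates=updates)
    f()   # evaluate the probabilities (assuming the graph compiles)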
--- cortex/models/tests/test_pyramid_rnn.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 2573fa9..58ee4a8 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -7,6 +7,7 @@ import theano import theano.tensor as T +theano.config.optimizer = 'None' def test_build(dim_in=13, dim_h=17): pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], @@ -37,3 +38,18 @@ def test_step(pyramid=None, dim_in=13, dim_h=17): n = np.tanh(preact) np.testing.assert_almost_equal(t, n) + + +def test_call(pyramid=None, dim_in=13, dim_h=17): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + + rng = np.random.RandomState() + + x = rng.randn(13, 10, 13) + + y = pyramid(x) + + f = theano.function([], y[0]['p']) + + print f() From ee2236f00ed5ac1a0359e6139cb436b5fddfb170 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:25 -0600 Subject: [PATCH 17/37] Tests constructor for voc data iterator. --- cortex/datasets/basic/tests/test_voc.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 cortex/datasets/basic/tests/test_voc.py diff --git a/cortex/datasets/basic/tests/test_voc.py b/cortex/datasets/basic/tests/test_voc.py new file mode 100644 index 0000000..ba8810f --- /dev/null +++ b/cortex/datasets/basic/tests/test_voc.py @@ -0,0 +1,10 @@ +""" +Tests voc.py. +""" + +from ..voc import VOC + + +def test_voc(): + test = VOC(source='$data', batch_size=10) + From eff32b830807f3cd47b5068ebe9cf936d0f8c533 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 24 Jun 2016 15:13:46 -0600 Subject: [PATCH 18/37] Tests and minor bug fixes. Normalized outputs. --- cortex/datasets/basic/voc.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 66eab2d..7e0c1e8 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -7,7 +7,9 @@ import logging from ...utils.tools import resolve_path from PIL import Image +import PIL import random +import numpy as np class VOC(BasicDataset): @@ -100,7 +102,6 @@ def image_to_pixels(im): width, height, = im.size return [pixels[i * width:(i + 1) * width] for i in xrange(height)] - def project_to_binary(pixels): """Helper function for get_data, returns binary version of input pixels. 
@@ -112,7 +113,7 @@ def project_to_binary(pixels): """ retval = [] for ln in pixels: - retval.append([bool(val) for val in ln]) + retval.append([int(bool(val)) for val in ln]) return retval def get_random_chunk(pixels_data, pixels_label): @@ -129,13 +130,13 @@ def get_random_chunk(pixels_data, pixels_label): x = rand.randint(buff_dist, len(pixels_data[0]) - buff_dist) data_chunk = [] label_val = pixels_label[y][x] - for i in range(y -buff_dist + 1, y + buff_dist): - data_chunk.append(pixels_data[i][x - buff_dist + 1 : buff_dist - 1]) + for index in range(y - buff_dist + 1, y + buff_dist): + data_chunk.append(pixels_data[index][x - buff_dist + 1:x + buff_dist]) assert len(data_chunk) == self.chunk_size and len(data_chunk[0]) == self.chunk_size return data_chunk, label_val names = [] - with open(source + 'VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: + with open(source + '/basic/VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: for line in f: names.append(line[:-1]) @@ -144,12 +145,12 @@ def get_random_chunk(pixels_data, pixels_label): images_loaded = 0 for name in names: if images_loaded < self.images_loaded: - label_im = Image.open(source + '/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') + label_im = Image.open(source + '/basic/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') label_pixels = image_to_pixels(label_im) if get_unique(label_pixels) == 3: self.label_images.append(project_to_binary(label_pixels)) - data_im = Image.open(source + 'VOCdevkit/VOC2010/JPEGImages/' + name + '.jpeg').convert('1') + data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('1') self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 @@ -161,11 +162,11 @@ def get_random_chunk(pixels_data, pixels_label): for i in xrange(0, self.chunks): k = rand.randint(0, images_loaded - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) - X.append(x) - Y.append(y) + X.append(np.array(x, dtype='float32')/255) # Normalize + Y.append(np.array(y, dtype='float32')) assert len(X) == self.chunks and len(Y) == self.chunks - return X, Y + return np.array(X), np.array(Y) From af3d9d57f81d6acc6d5412470e4cf5ddaa1a2de9 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:17:52 -0600 Subject: [PATCH 19/37] Trying to get a successful demo running. Has a lot of debugging mess in it. Not working. --- cortex/demos/demos_basic/pyramid_voc.py | 125 ++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 cortex/demos/demos_basic/pyramid_voc.py diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py new file mode 100644 index 0000000..b4d35f4 --- /dev/null +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -0,0 +1,125 @@ +""" +Demo for Pyramid RNN on VOC classification dataset. + +Try with 'cortex-run pyramid_voc.py' +""" + +""" +Demo for next word guessing using an RNN. 
+ +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +import numpy as np +from cortex.models.pyramid_rnn import Pyramid_RNN +from cortex.utils import intX, floatX +from cortex.datasets import resolve as resolve_dataset +import theano.tensor as T + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='rmsprop', + optimizer_args=dict(), + epochs=100, + valid_key='-sum log p(x | y)', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=10, + valid_batch_size=10, + #test_batch_size=10, + debug=False, + dataset='voc', + chunks=1000, + distribution='multinomial', + chunk_size=15, + source='$data' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, + mask_in='mask_in' +) + +pyramid_args = dict( + dim_hs=[17], + input_layer='voc', + output='label', +) + +extra_arg_keys = ['pyramid_args'] + +theano.config.on_unused_input = 'ignore' +theano.config.optimizer = 'None' +#theano.config.exception_verbosity = 'high' +#theano.config.compute_test_value = 'warn' + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + pyramid_args = module.pyramid_args + width = dataset.chunk_size + dim_in = 1 + dim_out = 1 + distribution = dataset.distributions[pyramid_args['output']] + + model = Pyramid_RNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, + width=width, **pyramid_args) + + models['pyramid_rnn'] = model + return models + + +def _cost(module): + models = module.models + + X = module.inputs[module.dataset.name]#.swapaxes(0, 1) + Y = module.inputs['label'] + used_inputs = [module.dataset.name, 'label'] + + model = models['pyramid_rnn'] + main(model) + + outputs, preact, updates = model(X) + + results = OrderedDict() + p = outputs['p'] + base_cost = model.neg_log_prob(Y, p).sum(0).mean() + cost = base_cost + + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] = cost + + return used_inputs, results, updates, constants, outputs + + +def main(model): + x = T.alloc(1, 8, 10, 17) + + params = model.get_sample_params() + + #print params + + a = model.call_seqs(x, None, 0, *params)[0] + + print a.eval().shape, '****************************************' From b8275cbdcd255a6532c7e15ff3e054153fa93584 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:18:53 -0600 Subject: [PATCH 20/37] Added factory, support for train/valid/test modes, and fixed output types. --- cortex/datasets/basic/voc.py | 63 +++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 7e0c1e8..627f4cd 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -20,26 +20,30 @@ class VOC(BasicDataset): """ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, - mode='train', source=None, name='voc', **kwargs): + start_image=0, mode='train', source=None, name='voc', **kwargs): """ Args: images_loaded (int): How many images to load chunk_size (int): Dimension of chunks to be made. - mode (str): Type of data to load, train, trainval, val. 
+ mode (str): Type of data to load, train, valid, test. source (str): Path to directory containing VOCdevkit name: Name of iterator **kwargs: """ + self.mode_resolve = {'train': 'train', 'valid': 'trainval', 'test': 'val'} + self.mode = self.mode_resolve[mode] + self.logger = logging.getLogger( '.'.join([self.__module__, self.__class__.__name__])) - self.logger.info('Loading %s from %s' % (name, source)) + self.logger.info('Loading %s from %s as %s' % (name, source, self.mode)) if source is None: raise ValueError('No source file provided.') source = resolve_path(source) + self.start_image = start_image self.chunks = chunks self.images_loaded = images_loaded if chunk_size % 2: @@ -48,7 +52,7 @@ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, self.logger.info('Using %d + 1 to get an odd chunk size.' % chunk_size) self.chunk_size = chunk_size + 1 - X, Y = self.get_data(source, mode) + X, Y = self.get_data(source, self.mode) data = {name: X, 'label': Y} distributions = {name: 'multinomial', 'label': 'multinomial'} @@ -60,12 +64,42 @@ def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000, if self.shuffle: self.randomize() + @staticmethod + def factory(split=None, idx=None, batch_sizes=None, **kwargs): + if split is None: + raise NotImplementedError('Idx are not supported for this dataset yet.') + if batch_sizes is None: + raise ValueError('Need batch sizes') + + chunks = kwargs['chunks'] + chunk_ammounts = [] + for val in split: + chunk_ammounts.append(int(chunks * val)) + + train = VOC(images_loaded=10, start_image=0, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[0], mode='train', source=kwargs['source'], + batch_size=batch_sizes[0]) + valid = VOC(images_loaded=5, start_image=10, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[1], mode='valid', source=kwargs['source'], + batch_size=batch_sizes[1]) + test = VOC(images_loaded=5, start_image=15, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[2], mode='test', source=kwargs['source'], + batch_size=batch_sizes[2]) + + accum = 0 + idx = [] + for val in chunk_ammounts: + idx.append(range(accum, accum + val)) + accum += val + + return train, valid, test, idx + def get_data(self, source, mode): """Gets data given source, chunks it, and returns chunks with center labels. Args: source (str): File path to directory containing VOCdevkit. - mode (str): Mode of data, eg. train, trainval, val. + mode (str): Mode of data, eg. train, valid, test. Returns: @@ -119,7 +153,7 @@ def project_to_binary(pixels): def get_random_chunk(pixels_data, pixels_label): """Helper function for get_data, gets random chunk from data, and returns label for center. - Args: + Args:self, VOC pixels_data (list of lists): Image pixels of data. pixels_label (list of lists): Image pixels of label. 
@@ -144,7 +178,7 @@ def get_random_chunk(pixels_data, pixels_label): self.label_images = [] images_loaded = 0 for name in names: - if images_loaded < self.images_loaded: + if images_loaded < (self.images_loaded + self.start_image) and images_loaded >= self.start_image: label_im = Image.open(source + '/basic/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') label_pixels = image_to_pixels(label_im) if get_unique(label_pixels) == 3: @@ -154,13 +188,15 @@ def get_random_chunk(pixels_data, pixels_label): self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 + elif images_loaded < self.start_image: + images_loaded += 1 else: break X = [] Y = [] for i in xrange(0, self.chunks): - k = rand.randint(0, images_loaded - 1) + k = rand.randint(0, len(self.data_images) - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) X.append(np.array(x, dtype='float32')/255) # Normalize Y.append(np.array(y, dtype='float32')) @@ -169,4 +205,15 @@ def get_random_chunk(pixels_data, pixels_label): return np.array(X), np.array(Y) + def next(self): + rval = super(VOC, self).next() + + #data = rval['voc'] + + #rval['voc'] = [] + #for k in range(0, 4): + # rval['voc'].append(np.swapaxes(np.rot90(np.swapaxes(data, 1, 2), k), 1, 2)[0:(self.chunk_size + 1)/2].astype('float32')) + + rval['label'] = np.array([b[1] for b in rval['label']]) + return rval From a6400abe95f51c507038dd21a4639cedb1605087 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Wed, 29 Jun 2016 14:20:35 -0600 Subject: [PATCH 21/37] Attempted to get demo running. Not currently working. Has much excess debug code. --- cortex/models/pyramid_rnn.py | 127 ++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 71949da..c8f4f7f 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -14,16 +14,19 @@ class Pyramid_RNN(RNN): - def __init__(self, dim_in, dim_hs, dim_out=None, output_net=None, - input_net=None, name='pyramid', **kwargs): + def __init__(self, dim_in, dim_hs, width, dim_out=None, + output_net=None, input_net=None, name='pyramid', **kwargs): if dim_out is None: self.dim_out = 1 + + self.width = width + super(Pyramid_RNN, self).__init__(dim_in=dim_in, dim_hs=dim_hs, name=name, output_net=output_net, input_net=input_net, **kwargs) @staticmethod - def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): + def factory(dim_in=None, dim_out=None, dim_hs=None, width=None, **kwargs): '''Factory for creating MLPs for Pyramid_RNN and returning . Convenience to quickly create MLPs from dictionaries, linking all @@ -34,6 +37,7 @@ def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): dim_hs (list): dimensions of recurrent units. dim_out (Optional[int]): output dimension. If not provided, assumed to be dim_in. + width (int): How wide the input block is. 
Returns: RNN @@ -42,10 +46,41 @@ def factory(dim_in=None, dim_out=None, dim_hs=None, **kwargs): assert len(dim_hs) > 0 if dim_out is None: dim_out = 1 - mlps, kwargs = RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) + mlps, kwargs = Pyramid_RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) kwargs.update(**mlps) - return Pyramid_RNN(dim_in, dim_hs, dim_out=dim_out, **kwargs) + return Pyramid_RNN(dim_in, dim_hs, width, dim_out=dim_out, **kwargs) + + @staticmethod + def mlp_factory(dim_in, dim_out, dim_hs, o_dim_in=None, i_net=None, + o_net=None, c_net=None, data_distribution='binomial', + initialization=None, init_args=None, **kwargs): + '''Factory for creating MLPs for Pyramid RNN. + + Args: + dim_in (int): input dimension. + dim_out (int): output dimension. If not provided, assumed + to be dim_in. + dim_hs (list): dimensions of recurrent units. + o_dim_in (Optional[int]): optional input dimension for output + net. If not provided, then use the last hidden dim. + i_net (dict): input network args. + o_net (dict): output network args. + c_net (dict): conditional network args. + data_distribution (str): distribution of the output. + initialization (str): type of initialization. + init_args (dict): initialization keyword arguments. + **kwargs: extra keyword arguments. + + Returns: + dict: MLPs. + dict: extra keyword arguments. + + ''' + + mlps, kwargs = RNN.mlp_factory(dim_in=dim_in, dim_out=dim_out, dim_hs=dim_hs, **kwargs) + + return mlps, kwargs def set_params(self): '''Initialize RNN parameters. @@ -95,15 +130,17 @@ def step_call(self, x, m, h0s, *params): input = x updates = theano.OrderedUpdates() + print 'width = ', self.width, '**************** ', (self.width + 1)/2, self.dim_hs, '++++++++++++++++++++++++++' h0s = [] hs = [] - directional_values = [] + #directional_values = [] for k in range(0, 4): # Iterate through directions. - x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.dim_in + 1)/2].astype('float32') - h0s.append([T.alloc(0., x.shape[1], self.dim_in, dim_h).astype(floatX) for dim_h in self.dim_hs]) + #x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.width + 1)/2].astype('float32') + x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') + h0s.append([T.alloc(0., (self.width + 1)/2, x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs]) for i, h0 in enumerate(h0s[k]): - seqs = [m[:, :, None]] + self.call_seqs(x, None, i, *params) + seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] non_seqs = [self.get_recurrent_args(*params)[i]] h, updates_ = theano.scan( @@ -116,14 +153,19 @@ def step_call(self, x, m, h0s, *params): hs.append(h) x = h updates += updates_ - directional_values.append(h[(self.dim_in + 1)/2]) # Remember directional outputs. + if k == 0: + output = [h[-1, :, (self.width + 1)/2, :]] + else: + output = output + [h[-1, :, (self.width + 1)/2, :]] + #directional_values += [h[-1, :, (self.width + 1)/2, :]] # Remember directional outputs. + #print T.sum(output) o_params = self.get_output_args(*params) - out_net_out = self.output_net.step_call(sum(directional_values), *o_params) # Sum different directions. + out_net_out = self.output_net.step_call(T.sum(output), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. 
@@ -143,18 +185,9 @@ def __call__(self, x, m=None, h0s=None, condition_on=None): ''' constants = [] - input_rotations = [] - - ''' Calculating h0s in step_call so that rotations of data happen once and don't need to be saved. - if h0s is None and self.init_net is not None: - h0s = self.init_net.initialize(x[0]) - constants += h0s - elif h0s is None: - h0s = [T.alloc(0., x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs] - ''' if m is None: - m = T.ones((x.shape[0], x.shape[1])).astype(floatX) + m = T.ones((x.shape)).astype(floatX) params = self.get_sample_params() @@ -175,18 +208,60 @@ def call_seqs(self, x, condition_on, level, *params): list: list of scan inputs. ''' - print x.shape + """ + #print x.shape + x = x[:, :, :, None]#.swapaxes(0, 3) + #print x.shape if level == 0: i_params = self.get_input_args(*params) a = self.input_net.step_preact(x, *i_params) else: i_params = self.get_inter_args(level - 1, *params) a = self.inter_nets[level - 1].step_preact(x, *i_params) - print a.shape - print self.input_net.dim_in - print self.input_net.dim_out + #print a.shape + #print self.input_net.dim_in + #print self.input_net.dim_out + """ + + params = list(params) + + print params + + W = params.pop(1) + b = params.pop(1) + + print W, type(W) + + print params + + for i, z in enumerate(x): + a = T.concatenate([(T.dot(c, W) + b)[:, None, None] for c in z.swapaxes(0, 1)], 1).swapaxes(0, 1) + if i: + T.concatenate([rval, a], 2) + else: + rval = a + + #a = T.dot(a, T.alloc(0, 2, 2)) if condition_on is not None: a += condition_on return [a] + + def rotate(self, tensor, n_times): + #retval = tensor.copy() + if n_times == 0: + return tensor + + #for i in range(0, self.width): + # retval[:, :, tensor.shape[0] - i] = tensor[i, :, :] + + #retval = np.array([tensor[:, :, self.width - 1 - i] for i in range(0, self.width)]) + #retval = + retval = tensor.swapaxes(0, 2)[::-1] + + + if n_times == 1: + return retval + elif n_times > 1: + return self.rotate(retval, n_times - 1) From 7c468ce9d94e1612cbc8a0c07d478784360ec2a2 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:32:44 -0600 Subject: [PATCH 22/37] Added support for VOC dataset. --- cortex/datasets/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index acb6749..354c36f 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -43,13 +43,15 @@ def resolve(c): from .basic.uci import UCI from .basic.cifar import CIFAR from .basic.europarl import Europarl + from .basic.voc import VOC r_dict = { 'mnist': MNIST, 'cifar': CIFAR, 'caltech': CALTECH, 'uci': UCI, - 'europarl': Europarl + 'europarl': Europarl, + 'voc': VOC } C = r_dict.get(c, None) @@ -178,7 +180,7 @@ def dataset_factory(resolve_dataset, dataset=None, split=[0.7, 0.2, 0.1], valid_batch_size=valid_batch_size, test_batch_size=test_batch_size, **dataset_args) else: - train, valid, test, idx = C.factory( + train, valid, test, idx = C.factory( split=split, idx=idx, batch_sizes=[train_batch_size, valid_batch_size, test_batch_size], **dataset_args) From 578ca1d32a4ca2d4aad17289bb2418ab38e9d978 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:34:24 -0600 Subject: [PATCH 23/37] Bugfixes and cleanup for demo. 
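Part of the cleanup below is the rotate() helper, which turns the input volume a quarter
turn per call via tensor.swapaxes(0, 2)[::-1]. A NumPy sketch of the same operation, under
the assumption that the input is square along its first and third axes (shape
(width, batch, width), as __call__ expects):

    import numpy as np

    def rotate_once(a):
        # same index shuffle as rotate() below, written for NumPy arrays
        return a.swapaxes(0, 2)[::-1]

    x = np.random.RandomState(0).randn(13, 10, 13)

    # one call is a quarter turn in the (first, third)-axis plane
    assert np.allclose(rotate_once(x), np.rot90(x.swapaxes(1, 2)).swapaxes(1, 2))
    # four calls bring the array back to where it started
    assert np.allclose(rotate_once(rotate_once(rotate_once(rotate_once(x)))), x)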
--- cortex/models/pyramid_rnn.py | 53 ++++-------------------------------- 1 file changed, 6 insertions(+), 47 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index c8f4f7f..6280810 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -130,15 +130,12 @@ def step_call(self, x, m, h0s, *params): input = x updates = theano.OrderedUpdates() - print 'width = ', self.width, '**************** ', (self.width + 1)/2, self.dim_hs, '++++++++++++++++++++++++++' h0s = [] hs = [] - #directional_values = [] for k in range(0, 4): # Iterate through directions. - #x = np.swapaxes(np.rot90(np.swapaxes(input, 1, 2), k), 1, 2)[0:(self.width + 1)/2].astype('float32') x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') - h0s.append([T.alloc(0., (self.width + 1)/2, x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs]) + h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) for i, h0 in enumerate(h0s[k]): seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] @@ -157,15 +154,13 @@ def step_call(self, x, m, h0s, *params): output = [h[-1, :, (self.width + 1)/2, :]] else: output = output + [h[-1, :, (self.width + 1)/2, :]] - #directional_values += [h[-1, :, (self.width + 1)/2, :]] # Remember directional outputs. - #print T.sum(output) o_params = self.get_output_args(*params) - out_net_out = self.output_net.step_call(T.sum(output), *o_params) # Sum different directions. + out_net_out = self.output_net.step_call(T.sum(output, 0), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] + return coll.OrderedDict(hs=hs, p=p[:, 0], z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. @@ -187,7 +182,7 @@ def __call__(self, x, m=None, h0s=None, condition_on=None): constants = [] if m is None: - m = T.ones((x.shape)).astype(floatX) + m = T.ones(x.shape).astype(floatX) params = self.get_sample_params() @@ -208,59 +203,23 @@ def call_seqs(self, x, condition_on, level, *params): list: list of scan inputs. 
''' - """ - #print x.shape - x = x[:, :, :, None]#.swapaxes(0, 3) - #print x.shape + + x = x[:, :, :, None] if level == 0: i_params = self.get_input_args(*params) a = self.input_net.step_preact(x, *i_params) else: i_params = self.get_inter_args(level - 1, *params) a = self.inter_nets[level - 1].step_preact(x, *i_params) - #print a.shape - #print self.input_net.dim_in - #print self.input_net.dim_out - """ - - params = list(params) - - print params - - W = params.pop(1) - b = params.pop(1) - - print W, type(W) - - print params - - for i, z in enumerate(x): - a = T.concatenate([(T.dot(c, W) + b)[:, None, None] for c in z.swapaxes(0, 1)], 1).swapaxes(0, 1) - if i: - T.concatenate([rval, a], 2) - else: - rval = a - - #a = T.dot(a, T.alloc(0, 2, 2)) - - if condition_on is not None: - a += condition_on return [a] def rotate(self, tensor, n_times): - #retval = tensor.copy() if n_times == 0: return tensor - #for i in range(0, self.width): - # retval[:, :, tensor.shape[0] - i] = tensor[i, :, :] - - #retval = np.array([tensor[:, :, self.width - 1 - i] for i in range(0, self.width)]) - #retval = retval = tensor.swapaxes(0, 2)[::-1] - if n_times == 1: return retval elif n_times > 1: From 90e37a5a1ff3f5f7c6fd25de0cd89b89302383d2 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:35:06 -0600 Subject: [PATCH 24/37] Bugfix, demo runs now but doesnt seem to train particularly well. --- cortex/demos/demos_basic/pyramid_voc.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index b4d35f4..df8ece9 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -14,7 +14,7 @@ import theano import numpy as np from cortex.models.pyramid_rnn import Pyramid_RNN -from cortex.utils import intX, floatX +from cortex.utils import intX, floatX, logger from cortex.datasets import resolve as resolve_dataset import theano.tensor as T @@ -34,12 +34,12 @@ _dataset_args = dict( train_batch_size=10, valid_batch_size=10, - #test_batch_size=10, + # test_batch_size=10, debug=False, dataset='voc', chunks=1000, distribution='multinomial', - chunk_size=15, + chunk_size=7, source='$data' ) @@ -50,7 +50,7 @@ ) pyramid_args = dict( - dim_hs=[17], + dim_hs=[25], input_layer='voc', output='label', ) @@ -82,18 +82,17 @@ def _build(module): def _cost(module): models = module.models - X = module.inputs[module.dataset.name]#.swapaxes(0, 1) + X = module.inputs[module.dataset.name].swapaxes(0, 1) Y = module.inputs['label'] used_inputs = [module.dataset.name, 'label'] model = models['pyramid_rnn'] - main(model) outputs, preact, updates = model(X) results = OrderedDict() p = outputs['p'] - base_cost = model.neg_log_prob(Y, p).sum(0).mean() + base_cost = model.neg_log_prob(Y, p).mean() cost = base_cost constants = [] @@ -112,14 +111,3 @@ def _cost(module): return used_inputs, results, updates, constants, outputs - -def main(model): - x = T.alloc(1, 8, 10, 17) - - params = model.get_sample_params() - - #print params - - a = model.call_seqs(x, None, 0, *params)[0] - - print a.eval().shape, '****************************************' From 09f7eb8642945ce331020cdd959cc3301e8e5863 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:36:22 -0600 Subject: [PATCH 25/37] Added support for masked input. 
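The mask passed through here reaches the recurrent step, where the update is
h = m * h + (1 - m) * h_, so timesteps whose mask is zero simply carry the previous
hidden state forward. A small NumPy illustration (not part of the diff):

    import numpy as np

    h_prev = np.array([[0.5, -0.2]], dtype='float32')   # state from the previous step
    h_cand = np.array([[0.9,  0.1]], dtype='float32')   # candidate state for this step
    m = np.array([[0.]], dtype='float32')                # 0 -> this timestep is padding

    h = m * h_cand + (1. - m) * h_prev
    assert np.allclose(h, h_prev)   # padded step leaves the state untouched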
--- cortex/demos/demos_basic/rnn_europarl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py index cabfa3e..d2f764a 100644 --- a/cortex/demos/demos_basic/rnn_europarl.py +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -36,6 +36,7 @@ _model_args = dict( dim_h=None, l2_decay=None, + mask_in='mask_in' ) simple_rnn_args = dict( @@ -71,11 +72,12 @@ def _build(module): def _cost(module): models = module.models + mask_in = module.inputs['mask_in'].transpose(1, 0) X = module.inputs[module.dataset.name].transpose(1, 0, 2) - used_inputs = [module.dataset.name] + used_inputs = [module.dataset.name, 'mask_in'] model = models['rnn'] - outputs, preact, updates = model(X) + outputs, preact, updates = model(X, m=mask_in) results = OrderedDict() p = outputs['p'] From 2de7e897e852c8f3cc804bca9b0317bb1d67e44a Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:38:32 -0600 Subject: [PATCH 26/37] Fixed up __call__ test which should work now. --- cortex/models/tests/test_pyramid_rnn.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 58ee4a8..16e8146 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -9,17 +9,18 @@ theano.config.optimizer = 'None' -def test_build(dim_in=13, dim_h=17): + +def test_build(dim_in=1, dim_h=17, width=13): pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], - dim_out=1) + width=width, dim_out=1) pyramid.set_tparams() return pyramid -def test_step(pyramid=None, dim_in=13, dim_h=17): +def test_step(pyramid=None, dim_in=1, dim_h=17, width=13): if pyramid is None: - pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) m = theano.tensor.tensor3() y = theano.tensor.tensor3() @@ -29,20 +30,22 @@ def test_step(pyramid=None, dim_in=13, dim_h=17): activation = pyramid._step(m, y, h_, Ur) f = theano.function([m, y, h_, Ur], activation) + t = f(np.ones((10, width, dim_h), dtype='float32'), np.ones((10, width, dim_h), dtype='float32'), + np.ones((10, width, dim_h), dtype='float32'), pyramid.params['Ur0']) - t = f(np.ones((10, dim_in, dim_h), dtype='float32'), np.ones((10, dim_in, dim_h), dtype='float32'), - np.ones((10, dim_in, dim_h), dtype='float32'), pyramid.params['Ur0']) + preact = np.ones((10, width, dim_h), dtype='float32') + \ + np.dot(np.ones((10, width, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) - preact = np.ones((10, dim_in, dim_h), dtype='float32') + \ - np.dot(np.ones((10, dim_in, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) n = np.tanh(preact) + print n.shape + np.testing.assert_almost_equal(t, n) -def test_call(pyramid=None, dim_in=13, dim_h=17): +def test_call(pyramid=None, dim_in=1, dim_h=17, width=13): if pyramid is None: - pyramid = test_build(dim_in=dim_in, dim_h=dim_h) + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) rng = np.random.RandomState() @@ -51,5 +54,3 @@ def test_call(pyramid=None, dim_in=13, dim_h=17): y = pyramid(x) f = theano.function([], y[0]['p']) - - print f() From 4ec1f81dd9dee8cfadd5d23f4147f34336d6371f Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:39:12 -0600 Subject: [PATCH 27/37] Added chunk_size specifier to arguments. Trivial change. 
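For reference, the chunk_size now passed by the test is tied to the geometry in
get_random_chunk: a chunk spans 2 * buff_dist - 1 rows and the assert there requires this
to equal chunk_size, so an odd chunk size gives the labelled pixel a well-defined centre.
A quick check of that arithmetic (assuming buff_dist = (chunk_size + 1) // 2, which is
what the assert implies; illustration only):

    chunk_size = 15
    buff_dist = (chunk_size + 1) // 2              # 8
    offsets = range(-buff_dist + 1, buff_dist)     # row offsets around the centre pixel
    assert len(list(offsets)) == chunk_size        # 15 rows, centred on offset 0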
--- cortex/datasets/basic/tests/test_voc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cortex/datasets/basic/tests/test_voc.py b/cortex/datasets/basic/tests/test_voc.py index ba8810f..356b1a1 100644 --- a/cortex/datasets/basic/tests/test_voc.py +++ b/cortex/datasets/basic/tests/test_voc.py @@ -6,5 +6,4 @@ def test_voc(): - test = VOC(source='$data', batch_size=10) - + test = VOC(source='$data', batch_size=10, chunk_size=15) From 638fccc4ee311412d0171ea2e7534e2ff7b4c044 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:46:45 -0600 Subject: [PATCH 28/37] These changes make no sense but without either or both of them the pyramid demo will not run. --- cortex/utils/trainer.py | 19 ++++++++++++++++++- cortex/utils/training.py | 15 ++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cortex/utils/trainer.py b/cortex/utils/trainer.py index 6f2ba19..ab5de97 100644 --- a/cortex/utils/trainer.py +++ b/cortex/utils/trainer.py @@ -29,6 +29,7 @@ logger = logging.getLogger('cortex') + def setup(module): '''Sets up module. @@ -37,6 +38,7 @@ def setup(module): print_section('Running setup') module.setup() + def set_data(module): '''Sets the datasets. @@ -61,6 +63,7 @@ def set_data(module): module.valid_dataset = datasets['valid'] module.test_dataset = datasets['test'] + def make_inputs(module): '''Forms the inputs from the dataset @@ -95,6 +98,7 @@ def make_inputs(module): dataset.reset() module.inputs = inps + def build(module, model_to_load=None): '''Forms the models. @@ -109,6 +113,7 @@ def build(module, model_to_load=None): module.models = models return set_tparams(module) + def set_tparams(module): tparams = OrderedDict() for k, v in module.models.iteritems(): @@ -116,6 +121,7 @@ def set_tparams(module): module.tparams = tparams return tparams + def set_cost(module): '''Sets costs. @@ -135,6 +141,7 @@ def set_cost(module): module.inputs = inputs return results, updates, constants, outputs + def set_test_function(module, results, outputs): '''Sets the test function of a module. @@ -142,9 +149,10 @@ def set_test_function(module, results, outputs): if hasattr(module, 'test'): f_test = module.test(results, outputs) else: - f_test = theano.function(module.inputs.values(), results) + f_test = theano.function(module.inputs.values()[::-1], results) return f_test + def set_out_function(module, results, outputs): '''Sets function for outputs. ''' @@ -158,6 +166,7 @@ def set_out_function(module, results, outputs): f_outs = theano.function(module.inputs.values(), outs) return f_outs + def set_save_function(module, tparams): '''Sets the save function of a module. @@ -174,6 +183,7 @@ def save(outfile): f_save = save return f_save + def set_viz_function(module, results, outputs): '''Sets the visualization function of a module. @@ -188,12 +198,14 @@ def set_viz_function(module, results, outputs): else: return None + def set_eval_functions(module, **kwargs): if hasattr(module, 'eval'): return module.eval(**kwargs) else: return OrderedDict() + def check(module): '''Runs checks. @@ -202,6 +214,7 @@ def check(module): logger.info('Checking experiment.') module.check() + def finish(module): '''Extra finishing-up. 
@@ -210,6 +223,7 @@ def finish(module): logger.info('Finishing up setup') module.finish() + def train(module, cost, tparams, updates, constants, f_test=None, f_save=None, f_viz=None, f_outs=None, test_every=10, show_every=10, monitor_gradients=False): @@ -289,6 +303,7 @@ def show(self): for k in self.eval_keys: self.__dict__[k]() + class ModuleContainer(object): __required_methods = ['_build', '_cost'] __optional_methods = ['_setup', '_data', '_test', '_save', '_viz', '_check', @@ -411,6 +426,7 @@ def add_component(component): if len(component_list) > 0: flatten_component_layers(component_list, model_dict) + def load_module(model_file, strict=True): '''Loads pretrained model. @@ -503,6 +519,7 @@ def load_module(model_file, strict=True): set_tparams(module) return module + def main(args=None): if args is None: args = sys.argv[1:] diff --git a/cortex/utils/training.py b/cortex/utils/training.py index 4796806..6f5bfb1 100644 --- a/cortex/utils/training.py +++ b/cortex/utils/training.py @@ -12,7 +12,7 @@ from collections import OrderedDict from glob import glob import logging -if not 'matplotlib' in sys.modules: +if 'matplotlib' not in sys.modules: import matplotlib matplotlib.use('Agg') import numpy as np @@ -47,6 +47,7 @@ np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) + def make_argument_parser(): '''Generic experiment parser. @@ -69,6 +70,7 @@ def make_argument_parser(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_trainer(): '''Generic experiment parser for a trainer. @@ -93,6 +95,7 @@ def make_argument_parser_trainer(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_test(): '''Generic experiment parser for testing. @@ -112,6 +115,7 @@ def make_argument_parser_test(): help='Verbosity of the logging. (0, 1, 2)') return parser + def set_experiment(args): '''Generic experiment setup method. @@ -186,6 +190,7 @@ def set_experiment(args): exp_dict['model_to_load'] = model_to_load return exp_dict + def reload_model(args): '''Reloads a model from argparse args. @@ -237,6 +242,7 @@ def reload_model(args): exp_dict.update(**args) return exp_dict + def set_model(create_model, model_to_load, unpack, **kwargs): '''Convenience method for creating new or loading old model. @@ -267,6 +273,7 @@ def create_model(): models = create_model() return models + def set_tparams(model_dict): '''Generic tparams setter. @@ -279,6 +286,7 @@ def set_tparams(model_dict): tparams.update(**model.set_tparams()) return tparams + def set_params(tparams, updates, excludes=[]): '''Sets params, removing updates from tparams. @@ -307,6 +315,7 @@ def set_params(tparams, updates, excludes=[]): return tparams, all_params + def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, optimizer='sgd', optimizer_args=None, **learning_args): @@ -343,6 +352,7 @@ def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, return f_grad_shared, f_grad_updates, learning_args + def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): '''Tests the model using a data iterator. @@ -406,6 +416,7 @@ def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): return results + def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, valid_sign=None, bestfile=None, **kwargs): '''Generic validation method. 
@@ -444,6 +455,7 @@ def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, return best_valid, best_epoch + def main_loop(train, valid, f_grad_shared, f_grad_updates, f_test, f_test_keys=None, @@ -583,6 +595,7 @@ def main_loop(train, valid, if e > epochs: break + inps.reverse() # THIS IS A SPOT FIX FOR 'pyramid_voc.py' DEMO! rval = f_grad_shared(*inps) if check_bad_nums(rval): From 5c8b4633464339c351f3bd45dc26910852452070 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 16:47:38 -0600 Subject: [PATCH 29/37] Fixed an issue with training data being binary and a docstring typo. --- cortex/datasets/basic/voc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 627f4cd..bce23f2 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -111,7 +111,7 @@ def get_unique(pixels): """Helper function for get_data, returns the number of unique classifiers in an image. Args: - im (list of lists): Pixel classifier values. + pixels (list of lists): Pixel classifier values. Returns (int): Number of unique classifiers in image. @@ -184,7 +184,7 @@ def get_random_chunk(pixels_data, pixels_label): if get_unique(label_pixels) == 3: self.label_images.append(project_to_binary(label_pixels)) - data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('1') + data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('L') self.data_images.append(image_to_pixels(data_im)) images_loaded += 1 @@ -198,7 +198,7 @@ def get_random_chunk(pixels_data, pixels_label): for i in xrange(0, self.chunks): k = rand.randint(0, len(self.data_images) - 1) x, y = get_random_chunk(self.data_images[k], self.label_images[k]) - X.append(np.array(x, dtype='float32')/255) # Normalize + X.append(np.array(x, dtype='float32')/255.0) # Normalize Y.append(np.array(y, dtype='float32')) assert len(X) == self.chunks and len(Y) == self.chunks @@ -208,12 +208,6 @@ def get_random_chunk(pixels_data, pixels_label): def next(self): rval = super(VOC, self).next() - #data = rval['voc'] - - #rval['voc'] = [] - #for k in range(0, 4): - # rval['voc'].append(np.swapaxes(np.rot90(np.swapaxes(data, 1, 2), k), 1, 2)[0:(self.chunk_size + 1)/2].astype('float32')) - rval['label'] = np.array([b[1] for b in rval['label']]) return rval From 608eccfc280d4f9794fa65b8a2fa8b9dcee674bb Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 21 Jul 2016 17:08:11 -0600 Subject: [PATCH 30/37] Removed unnecessary print statement. --- cortex/models/tests/test_pyramid_rnn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index 16e8146..f5e322a 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -38,8 +38,6 @@ def test_step(pyramid=None, dim_in=1, dim_h=17, width=13): n = np.tanh(preact) - print n.shape - np.testing.assert_almost_equal(t, n) From 78c527415bbeef3b9bea5ae25a9c2cf4ae87c284 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:54:34 -0600 Subject: [PATCH 31/37] Removed reversal of input order. 
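Context for removing the spot fix: theano.function takes its arguments positionally, so
the values fed in by the training loop have to arrive in the same order as
module.inputs.values() had when the function was compiled; reversing inps papered over an
ordering mismatch rather than fixing it. A minimal illustration of the positional
behaviour (not part of the diff):

    import numpy as np
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.vector('y')
    f = theano.function([x, y], x.sum() + y.sum())

    xv = np.ones((2, 3), dtype=theano.config.floatX)
    yv = np.ones((4,), dtype=theano.config.floatX)
    f(xv, yv)      # fine
    # f(yv, xv)    # fails: each argument has the wrong number of dimensions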
--- cortex/utils/training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cortex/utils/training.py b/cortex/utils/training.py index 6f5bfb1..1f279fc 100644 --- a/cortex/utils/training.py +++ b/cortex/utils/training.py @@ -595,7 +595,6 @@ def main_loop(train, valid, if e > epochs: break - inps.reverse() # THIS IS A SPOT FIX FOR 'pyramid_voc.py' DEMO! rval = f_grad_shared(*inps) if check_bad_nums(rval): From 4f83fe19e2387ecf2fe8d7561638cf04d3d06fb3 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:54:50 -0600 Subject: [PATCH 32/37] Removed reversal of input order. --- cortex/utils/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/utils/trainer.py b/cortex/utils/trainer.py index ab5de97..c9f759c 100644 --- a/cortex/utils/trainer.py +++ b/cortex/utils/trainer.py @@ -149,7 +149,7 @@ def set_test_function(module, results, outputs): if hasattr(module, 'test'): f_test = module.test(results, outputs) else: - f_test = theano.function(module.inputs.values()[::-1], results) + f_test = theano.function(module.inputs.values(), results) # Not sure why this change was made, but it seems necessary. return f_test From bf899380a2cbfc8defa880943bb1feb61e507e1a Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:55:24 -0600 Subject: [PATCH 33/37] Fixed some name shadowing through scopes. --- cortex/datasets/basic/voc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index bce23f2..0851eac 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -117,10 +117,10 @@ def get_unique(pixels): """ unique = [] - for line in pixels: - for i in line: - if i not in unique: - unique.append(i) + for l in pixels: + for j in l: + if j not in unique: + unique.append(j) return len(unique) def image_to_pixels(im): From 77ebe3a324aac1d80d4305d9fc073148e93ee641 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:55:59 -0600 Subject: [PATCH 34/37] Added proper test for __call__ function that compares it to numpy calculated values. --- cortex/models/tests/test_pyramid_rnn.py | 28 ++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py index f5e322a..0bcc21b 100644 --- a/cortex/models/tests/test_pyramid_rnn.py +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -51,4 +51,30 @@ def test_call(pyramid=None, dim_in=1, dim_h=17, width=13): y = pyramid(x) - f = theano.function([], y[0]['p']) + f = theano.function([], y[0]['z']) + + # Now calculate what f should be using Numpy. + + outs =[] + for k in range(0, 4): # Iterate through directions + x = np.rot90(x.swapaxes(1, 2)).swapaxes(1, 2) # Rotate input + + dir_input = x[:(width + 1)/2, :, :, None] + dir_input = pyramid.input_net.params['b0'] + np.dot(dir_input, pyramid.input_net.params['W0']) + + h = np.zeros((x.shape[1], width, dim_h)) + Ur = pyramid.params['Ur0'] + + for layer in dir_input: # Iterate through height of pyramid + h_t = np.concatenate((h, np.roll(h, 1, 2), np.roll(h, -1, 2)), 2) + preact = layer + np.dot(h_t, Ur) + h = np.tanh(preact) + + outs.append(h[:, (width + 1)/2, :]) # Remember output for direction + + output = pyramid.output_net.params['b0'] + np.dot(sum(outs), pyramid.output_net.params['W0']) # Sum over direction\ + # and apply output network. + + # Test for equality. + + np.testing.assert_almost_equal(output, f()) # Check if they match. 
From e40bdd33b0081e7dc225bb9e808acbea9deb22e5 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:56:20 -0600 Subject: [PATCH 35/37] Cleaned up __call__ a little bit. --- cortex/models/pyramid_rnn.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py index 6280810..a8cfe33 100644 --- a/cortex/models/pyramid_rnn.py +++ b/cortex/models/pyramid_rnn.py @@ -133,9 +133,10 @@ def step_call(self, x, m, h0s, *params): h0s = [] hs = [] + output = [] for k in range(0, 4): # Iterate through directions. - x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') - h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) + x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') # Rotates input + h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) # Make h0 for i, h0 in enumerate(h0s[k]): seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) outputs_info = [h0] @@ -150,17 +151,14 @@ def step_call(self, x, m, h0s, *params): hs.append(h) x = h updates += updates_ - if k == 0: - output = [h[-1, :, (self.width + 1)/2, :]] - else: - output = output + [h[-1, :, (self.width + 1)/2, :]] + output.append(h[-1, :, (self.width + 1)/2, :]) # Collect directional outputs. o_params = self.get_output_args(*params) out_net_out = self.output_net.step_call(T.sum(output, 0), *o_params) # Sum different directions. preact = out_net_out['z'] p = out_net_out['p'] - return coll.OrderedDict(hs=hs, p=p[:, 0], z=preact), updates, h0s[0] + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] def __call__(self, x, m=None, h0s=None, condition_on=None): '''Call function. @@ -215,6 +213,15 @@ def call_seqs(self, x, condition_on, level, *params): return [a] def rotate(self, tensor, n_times): + """ + + Args: + tensor (theano tensor): Tensor to rotate along first and third axes. + n_times (int): Number of times to rotate. + + Returns: Tensor rotated n times about its first and third axes. + + """ if n_times == 0: return tensor From fe4e639ffff22e345207cfaa8336ad2f445d84f6 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Fri, 22 Jul 2016 12:58:33 -0600 Subject: [PATCH 36/37] Made cost function work with adjusted pyramid_rnn code and removed the no optimizer flag. (The warn on excess input is still necessary though, otherwise f_out from trainer.py throws error upon creation.) 
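The cost below is base_cost = model.neg_log_prob(Y, p[:, 0]).mean(). The exact form
depends on the configured distribution, but with the binary centre-pixel labels used here
the natural reading is a Bernoulli cross-entropy. A NumPy sketch of that reading (an
assumption for illustration, not the library's implementation):

    import numpy as np

    def neg_log_prob(y, p, eps=1e-7):
        # Bernoulli cross-entropy, clipped for numerical safety
        p = np.clip(p, eps, 1. - eps)
        return -(y * np.log(p) + (1. - y) * np.log(1. - p))

    y = np.array([1., 0., 1.], dtype='float32')     # centre-pixel labels for 3 chunks
    p = np.array([0.9, 0.2, 0.6], dtype='float32')  # stand-in for p[:, 0]
    base_cost = neg_log_prob(y, p).mean()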
--- cortex/demos/demos_basic/pyramid_voc.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index df8ece9..fcaba9d 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -25,7 +25,7 @@ learning_rate_scheduler=None, optimizer='rmsprop', optimizer_args=dict(), - epochs=100, + epochs=150, valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] @@ -39,7 +39,7 @@ dataset='voc', chunks=1000, distribution='multinomial', - chunk_size=7, + chunk_size=25, source='$data' ) @@ -58,9 +58,9 @@ extra_arg_keys = ['pyramid_args'] theano.config.on_unused_input = 'ignore' -theano.config.optimizer = 'None' -#theano.config.exception_verbosity = 'high' -#theano.config.compute_test_value = 'warn' +# theano.config.optimizer = 'None' +# theano.config.exception_verbosity = 'high' +# theano.config.compute_test_value = 'warn' def _build(module): @@ -84,7 +84,7 @@ def _cost(module): X = module.inputs[module.dataset.name].swapaxes(0, 1) Y = module.inputs['label'] - used_inputs = [module.dataset.name, 'label'] + used_inputs = ['label', module.dataset.name] model = models['pyramid_rnn'] @@ -92,7 +92,7 @@ def _cost(module): results = OrderedDict() p = outputs['p'] - base_cost = model.neg_log_prob(Y, p).mean() + base_cost = model.neg_log_prob(Y, p[:, 0]).mean() cost = base_cost constants = [] @@ -110,4 +110,3 @@ def _cost(module): results['cost'] = cost return used_inputs, results, updates, constants, outputs - From b29a0cd8f757c7a457ce79040785513c1237f719 Mon Sep 17 00:00:00 2001 From: Jeremy-E-Johnson Date: Thu, 28 Jul 2016 10:15:08 -0600 Subject: [PATCH 37/37] Started preparing visuals. --- cortex/datasets/basic/voc.py | 8 +++++++- cortex/demos/demos_basic/pyramid_voc.py | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py index 0851eac..b234a71 100644 --- a/cortex/datasets/basic/voc.py +++ b/cortex/datasets/basic/voc.py @@ -7,9 +7,9 @@ import logging from ...utils.tools import resolve_path from PIL import Image -import PIL import random import numpy as np +import matplotlib.pyplot as plt class VOC(BasicDataset): @@ -211,3 +211,9 @@ def next(self): rval['label'] = np.array([b[1] for b in rval['label']]) return rval + + def save_images(self, image, ith_image, out_path=None): + + plt.matshow(image) + + plt.savefig(out_path + ith_image + '.png') diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py index fcaba9d..dbeb736 100644 --- a/cortex/demos/demos_basic/pyramid_voc.py +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -17,6 +17,7 @@ from cortex.utils import intX, floatX, logger from cortex.datasets import resolve as resolve_dataset import theano.tensor as T +import matplotlib.pyplot as plt # Default arguments @@ -25,7 +26,7 @@ learning_rate_scheduler=None, optimizer='rmsprop', optimizer_args=dict(), - epochs=150, + epochs=15, valid_key='-sum log p(x | y)', valid_sign='+', excludes=[] @@ -110,3 +111,10 @@ def _cost(module): results['cost'] = cost return used_inputs, results, updates, constants, outputs + + +def _vis(module, outputs, results): + out_path = module.out_path + + plt.matshow(np.zeros((10, 10))) + plt.savefig('Testp_plot.png')
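A standalone sketch of what the new save_images / _vis hooks above do (illustration only;
the output path is hypothetical, and the Agg backend mirrors what cortex/utils/training.py
already selects for headless runs):

    import matplotlib
    matplotlib.use('Agg')             # headless backend, as in training.py
    import matplotlib.pyplot as plt
    import numpy as np

    chunk = np.random.rand(25, 25)    # stand-in for one input chunk or probability map
    plt.matshow(chunk)                # render the 2D array
    plt.savefig('/tmp/voc_vis_0.png') # hypothetical output path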