From 853a0a5592a99b036928d3a90fb40c0e782c866c Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 18 Jul 2016 17:42:44 -0700 Subject: [PATCH 1/9] Adding gazetteers features logic --- loader.py | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/loader.py b/loader.py index a7368685..e28b74ce 100644 --- a/loader.py +++ b/loader.py @@ -3,6 +3,7 @@ import codecs from utils import create_dico, create_mapping, zero_digits from utils import iob2, iob_iobes +import numpy as np def load_sentences(path, lower, zeros): @@ -21,11 +22,12 @@ def load_sentences(path, lower, zeros): sentence = [] else: word = line.split() - assert len(word) >= 2 + #assert len(word) >= 2 sentence.append(word) if len(sentence) > 0: if 'DOCSTART' not in sentence[0][0]: sentences.append(sentence) + return sentences @@ -181,10 +183,49 @@ def augment_with_pretrained(dictionary, ext_emb_path, words): for word in words: if any(x in pretrained for x in [ word, - word.lower(), - re.sub('\d', '0', word.lower()) + word.lower()#, + #re.sub('\d', '0', word.lower()) ]) and word not in dictionary: dictionary[word] = 0 word_to_id, id_to_word = create_mapping(dictionary) return dictionary, word_to_id, id_to_word + +def load_gazetteers(gaz_path): + """ + Load extra gazetteers. + """ + lines = [line.rstrip() for line in open(gaz_path, 'r')] + gazetteers = {} + tags = {} + for line in lines: + split = line.split("\t") + assert len(split) >= 2 + category, gazetteer = " ".join(split[-1:]), " ".join(split[:-1]) + if category not in tags: + tags[category] = 1 + + if gazetteer not in gazetteers: + gazetteers[gazetteer] = [] + gazetteers[gazetteer].append(category) + else: + cat = gazetteers[gazetteer] + if category not in cat: + gazetteers[gazetteer].append(category) + return gazetteers,tags.keys() + +def add_gazetteers(data, token_to_gazetteers, index_to_token,gaz_tags): + """ + Add gazetteers features to data. 
+ """ + #tags = ['LOC', 'MISC', 'ORG', 'PER'] + tags = gaz_tags + for sentence in data: + sentence['gazetteers'] = [np.zeros(len(tags), dtype=np.int32) for _ in xrange(len(sentence['words']))] + for i, token_id in enumerate(sentence['words']): + token = index_to_token[token_id].lower() + if token in token_to_gazetteers: + values = set(token_to_gazetteers[token]) + for j,tag in enumerate(tags): + if tag in values: + np.put(sentence['gazetteers'][i],[j],[1]) From 9a64df66be7189fe6f11193fd3bd0f8691bf5ea7 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 18 Jul 2016 17:44:46 -0700 Subject: [PATCH 2/9] changes to model.py for gazetteer features --- model.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index fca1ab48..df8e649c 100644 --- a/model.py +++ b/model.py @@ -119,6 +119,7 @@ def build(self, lr_method, pre_emb, crf, + gaz_dim, cap_dim, training=True, **kwargs @@ -135,7 +136,11 @@ def build(self, if cap_dim: n_cap = 4 - # Network variables + #Gaz features + if gaz_dim: + n_gaz = self.parameters['gaz_dim'] # ner tags + + #Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') @@ -144,7 +149,8 @@ def build(self, tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') - + if gaz_dim: + gaz_values = T.ivector(name='gaz_values') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] @@ -240,6 +246,12 @@ def build(self, cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) + #gaz features + if gaz_dim: + input_dim += gaz_dim + gaz_layer = EmbeddingLayer(n_gaz, gaz_dim, name='gaz_layer') + inputs.append(gaz_layer.link(gaz_values)) + # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) @@ -335,6 +347,12 @@ def build(self, params.extend(cap_layer.params) self.add_component(final_layer) 
params.extend(final_layer.params) + # Gazetteers features + if gaz_dim: + self.add_component(gaz_layer) + #experiment.components['gaz_embeddings'] = gaz_embeddings + params.extend(gaz_layer.params) + if crf: self.add_component(transitions) params.append(transitions) @@ -353,6 +371,8 @@ def build(self, eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) + if gaz_dim: + eval_inputs.append(gaz_values) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters From 3adcfd048a590cd3916cc9521c65dbac4f39ca03 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 18 Jul 2016 17:53:32 -0700 Subject: [PATCH 3/9] Update train.py --- train.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 5dbe0436..4fb45392 100755 --- a/train.py +++ b/train.py @@ -92,6 +92,14 @@ "-r", "--reload", default="0", type='int', help="Reload the last saved model" ) +optparser.add_option( + "-G", "--gaz_path",default="", + help="Gazetteers dimension" +) +optparser.add_option( + "-g", "--gaz_dim", default="5", + help="Gazetteers dimension" +) opts = optparser.parse_args()[0] # Parse parameters @@ -111,18 +119,19 @@ parameters['crf'] = opts.crf == 1 parameters['dropout'] = opts.dropout parameters['lr_method'] = opts.lr_method - +parameters['gaz_dim'] = opts.gaz_dim +parameters['gaz_path'] = opts.gaz_path # Check parameters validity assert os.path.isfile(opts.train) assert os.path.isfile(opts.dev) assert os.path.isfile(opts.test) +assert os.path.isfile(opts.gaz_path) assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 assert 0. 
<= parameters['dropout'] < 1.0 assert parameters['tag_scheme'] in ['iob', 'iobes'] assert not parameters['all_emb'] or parameters['pre_emb'] assert not parameters['pre_emb'] or parameters['word_dim'] > 0 assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) - # Check evaluation script / folders if not os.path.isfile(eval_script): raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) @@ -183,6 +192,17 @@ print "%i / %i / %i sentences in train / dev / test." % ( len(train_data), len(dev_data), len(test_data)) +if parameters['gaz_dim']: + '''1: read from gazetteers file with the format: > + 2: once we read the gazetteers, we create a one-hot-encoding gazetteer vector + for every word in the sentence. The length of vector is equal to no of categories + and add the gazetteer feature vector for every word + ''' + gazetteers_dataset,gaz_tags = loader.load_gazetteers(parameters['gaz_path']) + parameters['gaz_dim'] = len(gaz_tags) + loader.add_gazetteers(train_data, gazetteers_dataset, id_to_word,gaz_tags) + loader.add_gazetteers(dev_data, gazetteers_dataset, id_to_word,gaz_tags) + loader.add_gazetteers(test_data, gazetteers_dataset, id_to_word,gaz_tags) # Save the mappings to disk print 'Saving the mappings to disk...' 
model.save_mappings(id_to_word, id_to_char, id_to_tag) From 8e09ec34a5c73c3798ebf737c4467cbcdacc3808 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 18 Jul 2016 17:57:14 -0700 Subject: [PATCH 4/9] Update utils.py --- utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils.py b/utils.py index 19e3175d..e9326233 100644 --- a/utils.py +++ b/utils.py @@ -198,6 +198,8 @@ def create_input(data, parameters, add_label, singletons=None): chars = data['chars'] if singletons is not None: words = insert_singletons(words, singletons) + if parameters['gaz_dim']: + gaz = data['gazetteers'] if parameters['cap_dim']: caps = data['caps'] char_for, char_rev, char_pos = pad_word_chars(chars) @@ -213,6 +215,8 @@ def create_input(data, parameters, add_label, singletons=None): input.append(caps) if add_label: input.append(data['tags']) + if parameters['gaz_dim']: + input.append(gaz) return input From 57255b08f4c2032e26ca04bf0604e8ac006bd017 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 18 Jul 2016 18:03:13 -0700 Subject: [PATCH 5/9] Update loader.py --- loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loader.py b/loader.py index e28b74ce..7658e268 100644 --- a/loader.py +++ b/loader.py @@ -22,7 +22,7 @@ def load_sentences(path, lower, zeros): sentence = [] else: word = line.split() - #assert len(word) >= 2 + assert len(word) >= 2 sentence.append(word) if len(sentence) > 0: if 'DOCSTART' not in sentence[0][0]: @@ -183,8 +183,8 @@ def augment_with_pretrained(dictionary, ext_emb_path, words): for word in words: if any(x in pretrained for x in [ word, - word.lower()#, - #re.sub('\d', '0', word.lower()) + word.lower(), + re.sub('\d', '0', word.lower()) ]) and word not in dictionary: dictionary[word] = 0 From ed9263a106d6e1e670d8a2e408ed3d16593728a7 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 1 Aug 2016 12:06:15 -0700 Subject: [PATCH 6/9] Update model.py --- model.py | 13 +++++++------ 1 file changed, 7 
insertions(+), 6 deletions(-) diff --git a/model.py b/model.py index df8e649c..8e7fba56 100644 --- a/model.py +++ b/model.py @@ -137,8 +137,9 @@ def build(self, n_cap = 4 #Gaz features - if gaz_dim: + if gaz_path: n_gaz = self.parameters['gaz_dim'] # ner tags + gaz_dim = n_gaz #Network variables is_train = T.iscalar('is_train') @@ -149,8 +150,8 @@ def build(self, tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') - if gaz_dim: - gaz_values = T.ivector(name='gaz_values') + if gaz_path: + gaz_values = T.imatrix(name='gaz_values') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] @@ -247,7 +248,7 @@ def build(self, inputs.append(cap_layer.link(cap_ids)) #gaz features - if gaz_dim: + if gaz_path: input_dim += gaz_dim gaz_layer = EmbeddingLayer(n_gaz, gaz_dim, name='gaz_layer') inputs.append(gaz_layer.link(gaz_values)) @@ -348,7 +349,7 @@ def build(self, self.add_component(final_layer) params.extend(final_layer.params) # Gazetteers features - if gaz_dim: + if gaz_path: self.add_component(gaz_layer) #experiment.components['gaz_embeddings'] = gaz_embeddings params.extend(gaz_layer.params) @@ -371,7 +372,7 @@ def build(self, eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) - if gaz_dim: + if gaz_path: eval_inputs.append(gaz_values) train_inputs = eval_inputs + [tag_ids] From 3cfaee514b30446f02abb7c511d68a328fc7f671 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 1 Aug 2016 12:08:30 -0700 Subject: [PATCH 7/9] Update train.py --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 4fb45392..cd1242a0 100755 --- a/train.py +++ b/train.py @@ -125,13 +125,13 @@ assert os.path.isfile(opts.train) assert os.path.isfile(opts.dev) assert os.path.isfile(opts.test) -assert os.path.isfile(opts.gaz_path) assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 assert 0. 
<= parameters['dropout'] < 1.0 assert parameters['tag_scheme'] in ['iob', 'iobes'] assert not parameters['all_emb'] or parameters['pre_emb'] assert not parameters['pre_emb'] or parameters['word_dim'] > 0 assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) +assert not parameters['gaz_path'] or os.path.isfile(parameters['gaz_path']) # Check evaluation script / folders if not os.path.isfile(eval_script): raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) From 53831dbba24346d42e89f94417087f32baa36a9e Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 1 Aug 2016 12:09:34 -0700 Subject: [PATCH 8/9] Update utils.py --- utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.py b/utils.py index e9326233..224cf968 100644 --- a/utils.py +++ b/utils.py @@ -198,7 +198,7 @@ def create_input(data, parameters, add_label, singletons=None): chars = data['chars'] if singletons is not None: words = insert_singletons(words, singletons) - if parameters['gaz_dim']: + if parameters['gaz_path']: gaz = data['gazetteers'] if parameters['cap_dim']: caps = data['caps'] @@ -215,7 +215,7 @@ def create_input(data, parameters, add_label, singletons=None): input.append(caps) if add_label: input.append(data['tags']) - if parameters['gaz_dim']: + if parameters['gaz_path']: input.append(gaz) return input From 5352dba79ebba9371f3fe6fa4cb838e11e2db5c2 Mon Sep 17 00:00:00 2001 From: Vardhaman Metpally Date: Mon, 1 Aug 2016 12:53:27 -0700 Subject: [PATCH 9/9] Update loader.py --- loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/loader.py b/loader.py index 7658e268..bbf9829e 100644 --- a/loader.py +++ b/loader.py @@ -194,6 +194,10 @@ def augment_with_pretrained(dictionary, ext_emb_path, words): def load_gazetteers(gaz_path): """ Load extra gazetteers. 
+ Each line consists of two tab-separated fields: + gazetteer TAB category (the category is the last field) + The same gazetteer may have multiple categories, but each pair must appear on + a different line. """ lines = [line.rstrip() for line in open(gaz_path, 'r')] gazetteers = {}