From 508ec66f59e43b5b64a012ebfe8719d502873c2b Mon Sep 17 00:00:00 2001
From: anlausch
Date: Wed, 3 Feb 2021 12:07:15 +0100
Subject: [PATCH] Update the data processing scripts to consider all relations
 as in the final version

---
 .../create_corpora_from_random_walks.py  | 20 +++--
 randomwalks_utility/preprocess_cn.py     | 89 ++++++++++++++-----
 randomwalks_utility/random_walks.py      | 29 +++---
 3 files changed, 96 insertions(+), 42 deletions(-)

diff --git a/randomwalks_utility/create_corpora_from_random_walks.py b/randomwalks_utility/create_corpora_from_random_walks.py
index eb4cdb1..93148fa 100644
--- a/randomwalks_utility/create_corpora_from_random_walks.py
+++ b/randomwalks_utility/create_corpora_from_random_walks.py
@@ -1,5 +1,6 @@
 import pickle
 import codecs
+
 from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor, as_completed
 
@@ -46,7 +47,7 @@ def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
-def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpus_", output_path_suffix=""):
+def generate_corpus_from_walks(walks, output_path_prefix="./data/rw_corpus_", output_path_suffix=""):
     # how do we actually want to generate the corpus?
     # one option is to always dublicate the node in the middle..
     # also Goran says that we want to keep the relations as separate tokens in the vocab. I do not necessarily agree with this, but we try.
@@ -75,13 +76,22 @@ def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpu
     with codecs.open(output_path, "w", "utf8") as out:
         out.write(text)
 
 
+def analyze_walks(path="./output/random_walk"):
+    walk = load_walks(path)
+    total_num_walks = len(walk)
+
+
 def main():
-    in_prefix = "randomwalks/random_walk_"
-    in_suffix = "1.0_1.0_2_15"
-    walks = load_walks(in_prefix + in_suffix + ".p")
-    generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "_nl")
+    for in_suffix in ["1.0_1.0_5_20","1.0_1.0_5_30"]:
+        in_prefix = "./data/random_walk_full_assertions_"
+        #in_suffix = "1.0_1.0_5_10"
+        walks = load_walks(in_prefix + in_suffix + ".p")
+        generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "full_assertions_nl")
 
 
 if __name__=="__main__":
     main()
+
+
+
diff --git a/randomwalks_utility/preprocess_cn.py b/randomwalks_utility/preprocess_cn.py
index 0602e3f..53ad1ae 100644
--- a/randomwalks_utility/preprocess_cn.py
+++ b/randomwalks_utility/preprocess_cn.py
@@ -1,34 +1,78 @@
 import codecs
 
-"""
-As we got the relations to consider from olga, we don't need to do this anymore
-"""
-# def filter_assertions(path="./relations/assertions.csv"):
-#     assertions = []
-#     with codecs.open(path, "r", "utf8") as f:
-#         reader = csv.DictReader(f, dialect=csv.excel_tab, fieldnames=["URI", "relation", "node_a", "node_b", "info"])
-#         for i,row in enumerate(reader):
-#             node_a = row["node_a"].split("/c/en/")
-#             node_b = row["node_b"].split("/c/en/")
-#             if len(node_a) > 1 and len(node_b) > 1:
-#                 # these should be nodes in english
-#                 node_a = node_a[1].split("/")[-1].replace("_", "-")
-#                 node_b = node_b[1].split("/")[-1].replace("_", "-")
-#                 print(node_a)
-#                 print(node_b)
 
-"""
-Based on the relations from olga
-"""
-def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.txt", "./relations/cn_isA.txt", "./relations/cn_mannerOf.txt","./relations/cn_synonyms.txt"], output_path="./randomwalks/cn_assertions_filtered.tsv"):
+def create_joined_assertions_for_random_walks(paths=["./data/cn_antonyms.txt",
+                                                      "./data/cn_isA.txt",
+                                                      "./data/cn_mannerOf.txt",
+                                                      "./data/cn_synonyms.txt",
"./data/cn_atLocation.txt", + "./data/cn_capableOf.txt", + "./data/cn_causes.txt", + "./data/cn_causesDesire.txt", + "./data/cn_createdBy.txt", + "./data/cn_definedAs.txt", + "./data/cn_derivedFrom.txt", + "./data/cn_desires.txt", + "./data/cn_distinctFrom.txt", + "./data/cn_hasA.txt", + "./data/cn_hasContext.txt", + "./data/cn_hasFirstSubevent.txt", + "./data/cn_hasLastSubevent.txt", + "./data/cn_hasSubevent.txt", + "./data/cn_hasPrerequisite.txt", + "./data/cn_hasProperty.txt", + "./data/cn_locatedNear.txt", + "./data/cn_madeOf.txt", + "./data/cn_motivatedByGoal.txt", + "./data/cn_obstructedBy.txt", + "./data/cn_partOf.txt", + "./data/cn_receivesAction.txt", + "./data/cn_relatedTo.txt", + "./data/cn_similarTo.txt", + "./data/cn_symbolOf.txt", + "./data/cn_usedFor.txt"], + output_path="./data/cn_assertions_full.tsv"): + """ + Transforms a series of relation input files to a joint file containing natural language assertations + :param paths: relation input paths + :param output_path: output paths + :return: + """ # we ideally want to have a "natural language representation" of the relations # TODO: keep in mind that antonymy and synonymy are bidirectional relationships, so maybe we want to account for this, i.e., by creating the corresponding pairs in the opposite direction or so # TODO: As an alternative of random walks, we can also just use the natural language representation of the relationships + # TODO: For camera-ready version: Run everything again with this improved mapping relation_dict = { "antonyms": "is an antonym of", "isA": "is a", "mannerOf": "is a manner of", - "synonyms": "is a synonym of" + "synonyms": "is a synonym of", + "atLocation": "is at", + "capableOf": "is capable of", + "causes": "causes", + "causesDesire": "causes the desire to", + "createdBy": "is created by", + "definedAs": "is defined as", + "derivedFrom": "is derived from", + "desires": "desires", + "distinctFrom": "is distinct from", + "hasA": "has a", + "hasContext": "is used in the context of", + "hasFirstSubevent": "begins with", + "hasLastSubevent": "concludes with", + "hasSubevent": "has as subevent", + "hasPrerequisite": "is dependent on", + "hasProperty": "can be described as", + "locatedNear": "is located near", + "madeOf": "is made of", + "motivatedByGoal": "is motivated by", + "obstructedBy": "is obstructed by", + "partOf": "is part of", + "receivesAction": "receives as action", + "relatedTo": "is related to", + "similarTo": "is similar to", + "symbolOf": "is a symbol of", + "usedFor": "is used for", } all_assertions = [] for path in paths: @@ -53,8 +97,7 @@ def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.tx def main(): create_joined_assertions_for_random_walks() - #profile_data() - #filter_assertions() if __name__ == "__main__": main() + diff --git a/randomwalks_utility/random_walks.py b/randomwalks_utility/random_walks.py index 188d237..9411b94 100644 --- a/randomwalks_utility/random_walks.py +++ b/randomwalks_utility/random_walks.py @@ -3,7 +3,7 @@ import random import pickle -def read_graph(path="./randomwalks/cn_assertions_filtered.tsv"): +def read_graph(path="./data/cn_assertions_filtered.tsv"): ''' Reads the input network in networkx. ''' @@ -195,21 +195,22 @@ def generate_random_walks_from_assertions(): p = 1.0 # return hyperparameter q = 1.0 # inout hyperparameter is_directed = True # whether the graph is directed - num_walks = 2 # number of wandom walks per source def. 10 - walk_length = 15 # length of walk per source def. 
+    num_walks = 5 # number of random walks per source def. 10
+    for walk_length in [20, 30]:
+        #walk_length = 20 # length of walk per source def. 80
 
-    nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
-    G = Graph(nx_G, is_directed, p, q)
-    G.preprocess_transition_probs()
-    walks = G.simulate_walks(num_walks, walk_length)
-    filename = "./randomwalks/random_walk_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
-    with open(filename, 'wb') as handle:
-        pickle.dump(walks, handle)
-    print(len(walks))
+        nx_G = read_graph(path="./data/cn_assertions_full.tsv")
+        G = Graph(nx_G, is_directed, p, q)
+        G.preprocess_transition_probs()
+        walks = G.simulate_walks(num_walks, walk_length)
+        filename = "./data/random_walk_full_assertions_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
+        with open(filename, 'wb') as handle:
+            pickle.dump(walks, handle)
+        print(len(walks))
 
 
 def analyze_graph():
-    nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
+    nx_G = read_graph(path="./data/cn_assertions_full.tsv")
     print("%d nodes in the graph" % nx_G.number_of_nodes())
     print("%d edges in the graph" % nx_G.number_of_edges())
     print("%f density of graph" % nx.density(nx_G))
@@ -250,7 +251,7 @@ def load_random_walk(p):
 def main():
     generate_random_walks_from_assertions()
     #analyze_graph()
-    # load_random_walk(p="./randomwalks/random_walk_1.0_1.0_2_10.p")
+    #load_random_walk(p="./data/random_walk_1.0_1.0_2_10.p")
 
 if __name__=="__main__":
-    main()
+    main()
\ No newline at end of file
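
Note (not part of the patch): the three updated scripts form a small pipeline. Below is a minimal sketch of the intended invocation order, assuming the modules are importable from within randomwalks_utility/ and that load_walks and generate_corpus_from_walks are defined in create_corpora_from_random_walks.py as the patch suggests; the driver itself is hypothetical and not in the repository.

# Hypothetical driver; function names and paths are taken from the patched modules.
from preprocess_cn import create_joined_assertions_for_random_walks
from random_walks import generate_random_walks_from_assertions
from create_corpora_from_random_walks import load_walks, generate_corpus_from_walks

# 1) Join the per-relation ConceptNet files into ./data/cn_assertions_full.tsv
create_joined_assertions_for_random_walks()

# 2) Simulate the walks (p = q = 1.0, 5 walks per source node, lengths 20 and 30)
#    and pickle them as ./data/random_walk_full_assertions_<p>_<q>_<num_walks>_<walk_length>.p
generate_random_walks_from_assertions()

# 3) Turn each pickled walk file into a natural language corpus under ./data/rw_corpus_*
for suffix in ["1.0_1.0_5_20", "1.0_1.0_5_30"]:
    walks = load_walks("./data/random_walk_full_assertions_" + suffix + ".p")
    generate_corpus_from_walks(walks, output_path_suffix=suffix + "full_assertions_nl")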