From 508ec66f59e43b5b64a012ebfe8719d502873c2b Mon Sep 17 00:00:00 2001
From: anlausch
Date: Wed, 3 Feb 2021 12:07:15 +0100
Subject: [PATCH] Update the data processing scripts to consider all relations
 as in the final version

---
 .../create_corpora_from_random_walks.py  | 20 +++--
 randomwalks_utility/preprocess_cn.py     | 89 ++++++++++++++-----
 randomwalks_utility/random_walks.py      | 29 +++---
 3 files changed, 96 insertions(+), 42 deletions(-)

diff --git a/randomwalks_utility/create_corpora_from_random_walks.py b/randomwalks_utility/create_corpora_from_random_walks.py
index eb4cdb1..93148fa 100644
--- a/randomwalks_utility/create_corpora_from_random_walks.py
+++ b/randomwalks_utility/create_corpora_from_random_walks.py
@@ -1,5 +1,6 @@
 import pickle
 import codecs
+
 from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor, as_completed
 
@@ -46,7 +47,7 @@ def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
-def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpus_", output_path_suffix=""):
+def generate_corpus_from_walks(walks, output_path_prefix="./data/rw_corpus_", output_path_suffix=""):
     # how do we actually want to generate the corpus?
     # one option is to always dublicate the node in the middle..
     # also Goran says that we want to keep the relations as separate tokens in the vocab. I do not necessarily agree with this, but we try.
@@ -75,13 +76,22 @@ def generate_corpus_from_walks(walks, output_path_prefix="./randomwalks/rw_corpu
     with codecs.open(output_path, "w", "utf8") as out:
         out.write(text)
 
 
+def analyze_walks(path="./output/random_walk"):
+    walk = load_walks(path)
+    total_num_walks = len(walk)
+
+
 def main():
-    in_prefix = "randomwalks/random_walk_"
-    in_suffix = "1.0_1.0_2_15"
-    walks = load_walks(in_prefix + in_suffix + ".p")
-    generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "_nl")
+    for in_suffix in ["1.0_1.0_5_20","1.0_1.0_5_30"]:
+        in_prefix = "./data/random_walk_full_assertions_"
+        #in_suffix = "1.0_1.0_5_10"
+        walks = load_walks(in_prefix + in_suffix + ".p")
+        generate_corpus_from_walks(walks, output_path_suffix=in_suffix + "full_assertions_nl")
 
 
 if __name__=="__main__":
     main()
+
+
+
diff --git a/randomwalks_utility/preprocess_cn.py b/randomwalks_utility/preprocess_cn.py
index 0602e3f..53ad1ae 100644
--- a/randomwalks_utility/preprocess_cn.py
+++ b/randomwalks_utility/preprocess_cn.py
@@ -1,34 +1,78 @@
 import codecs
 
-"""
-As we got the relations to consider from olga, we don't need to do this anymore
-"""
-# def filter_assertions(path="./relations/assertions.csv"):
-#     assertions = []
-#     with codecs.open(path, "r", "utf8") as f:
-#         reader = csv.DictReader(f, dialect=csv.excel_tab, fieldnames=["URI", "relation", "node_a", "node_b", "info"])
-#         for i,row in enumerate(reader):
-#             node_a = row["node_a"].split("/c/en/")
-#             node_b = row["node_b"].split("/c/en/")
-#             if len(node_a) > 1 and len(node_b) > 1:
-#                 # these should be nodes in english
-#                 node_a = node_a[1].split("/")[-1].replace("_", "-")
-#                 node_b = node_b[1].split("/")[-1].replace("_", "-")
-#                 print(node_a)
-#                 print(node_b)
 
-"""
-Based on the relations from olga
-"""
-def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.txt", "./relations/cn_isA.txt", "./relations/cn_mannerOf.txt","./relations/cn_synonyms.txt"], output_path="./randomwalks/cn_assertions_filtered.tsv"):
+def create_joined_assertions_for_random_walks(paths=["./data/cn_antonyms.txt",
+                                                      "./data/cn_isA.txt",
+                                                      "./data/cn_mannerOf.txt",
+                                                      "./data/cn_synonyms.txt",
"./data/cn_atLocation.txt", + "./data/cn_capableOf.txt", + "./data/cn_causes.txt", + "./data/cn_causesDesire.txt", + "./data/cn_createdBy.txt", + "./data/cn_definedAs.txt", + "./data/cn_derivedFrom.txt", + "./data/cn_desires.txt", + "./data/cn_distinctFrom.txt", + "./data/cn_hasA.txt", + "./data/cn_hasContext.txt", + "./data/cn_hasFirstSubevent.txt", + "./data/cn_hasLastSubevent.txt", + "./data/cn_hasSubevent.txt", + "./data/cn_hasPrerequisite.txt", + "./data/cn_hasProperty.txt", + "./data/cn_locatedNear.txt", + "./data/cn_madeOf.txt", + "./data/cn_motivatedByGoal.txt", + "./data/cn_obstructedBy.txt", + "./data/cn_partOf.txt", + "./data/cn_receivesAction.txt", + "./data/cn_relatedTo.txt", + "./data/cn_similarTo.txt", + "./data/cn_symbolOf.txt", + "./data/cn_usedFor.txt"], + output_path="./data/cn_assertions_full.tsv"): + """ + Transforms a series of relation input files to a joint file containing natural language assertations + :param paths: relation input paths + :param output_path: output paths + :return: + """ # we ideally want to have a "natural language representation" of the relations # TODO: keep in mind that antonymy and synonymy are bidirectional relationships, so maybe we want to account for this, i.e., by creating the corresponding pairs in the opposite direction or so # TODO: As an alternative of random walks, we can also just use the natural language representation of the relationships + # TODO: For camera-ready version: Run everything again with this improved mapping relation_dict = { "antonyms": "is an antonym of", "isA": "is a", "mannerOf": "is a manner of", - "synonyms": "is a synonym of" + "synonyms": "is a synonym of", + "atLocation": "is at", + "capableOf": "is capable of", + "causes": "causes", + "causesDesire": "causes the desire to", + "createdBy": "is created by", + "definedAs": "is defined as", + "derivedFrom": "is derived from", + "desires": "desires", + "distinctFrom": "is distinct from", + "hasA": "has a", + "hasContext": "is used in the context of", + "hasFirstSubevent": "begins with", + "hasLastSubevent": "concludes with", + "hasSubevent": "has as subevent", + "hasPrerequisite": "is dependent on", + "hasProperty": "can be described as", + "locatedNear": "is located near", + "madeOf": "is made of", + "motivatedByGoal": "is motivated by", + "obstructedBy": "is obstructed by", + "partOf": "is part of", + "receivesAction": "receives as action", + "relatedTo": "is related to", + "similarTo": "is similar to", + "symbolOf": "is a symbol of", + "usedFor": "is used for", } all_assertions = [] for path in paths: @@ -53,8 +97,7 @@ def create_joined_assertions_for_random_walks(paths=["./relations/cn_antonyms.tx def main(): create_joined_assertions_for_random_walks() - #profile_data() - #filter_assertions() if __name__ == "__main__": main() + diff --git a/randomwalks_utility/random_walks.py b/randomwalks_utility/random_walks.py index 188d237..9411b94 100644 --- a/randomwalks_utility/random_walks.py +++ b/randomwalks_utility/random_walks.py @@ -3,7 +3,7 @@ import random import pickle -def read_graph(path="./randomwalks/cn_assertions_filtered.tsv"): +def read_graph(path="./data/cn_assertions_filtered.tsv"): ''' Reads the input network in networkx. ''' @@ -195,21 +195,22 @@ def generate_random_walks_from_assertions(): p = 1.0 # return hyperparameter q = 1.0 # inout hyperparameter is_directed = True # whether the graph is directed - num_walks = 2 # number of wandom walks per source def. 10 - walk_length = 15 # length of walk per source def. 
+    num_walks = 5 # number of random walks per source def. 10
+    for walk_length in [20, 30]:
+        #walk_length = 20 # length of walk per source def. 80
 
-    nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
-    G = Graph(nx_G, is_directed, p, q)
-    G.preprocess_transition_probs()
-    walks = G.simulate_walks(num_walks, walk_length)
-    filename = "./randomwalks/random_walk_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
-    with open(filename, 'wb') as handle:
-        pickle.dump(walks, handle)
-    print(len(walks))
+        nx_G = read_graph(path="./data/cn_assertions_full.tsv")
+        G = Graph(nx_G, is_directed, p, q)
+        G.preprocess_transition_probs()
+        walks = G.simulate_walks(num_walks, walk_length)
+        filename = "./data/random_walk_full_assertions_" + str(p) + "_" + str(q) + "_" + str(num_walks) + "_" + str(walk_length) + ".p"
+        with open(filename, 'wb') as handle:
+            pickle.dump(walks, handle)
+        print(len(walks))
 
 
 def analyze_graph():
-    nx_G = read_graph(path="./randomwalks/cn_assertions_filtered.tsv")
+    nx_G = read_graph(path="./data/cn_assertions_full.tsv")
     print("%d nodes in the graph" % nx_G.number_of_nodes())
     print("%d edges in the graph" % nx_G.number_of_edges())
     print("%f density of graph" % nx.density(nx_G))
@@ -250,7 +251,7 @@ def load_random_walk(p):
 def main():
     generate_random_walks_from_assertions()
     #analyze_graph()
-    # load_random_walk(p="./randomwalks/random_walk_1.0_1.0_2_10.p")
+    #load_random_walk(p="./data/random_walk_1.0_1.0_2_10.p")
 
 if __name__=="__main__":
-    main()
+    main()
\ No newline at end of file
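
Note (not part of the patch): the three updated scripts form a small pipeline. Below is a minimal sketch of the intended invocation order, assuming the modules are importable from within randomwalks_utility/ and that load_walks and generate_corpus_from_walks are defined in create_corpora_from_random_walks.py as the patch suggests; the driver itself is hypothetical and not in the repository.

# Hypothetical driver; function names and paths are taken from the patched modules.
from preprocess_cn import create_joined_assertions_for_random_walks
from random_walks import generate_random_walks_from_assertions
from create_corpora_from_random_walks import load_walks, generate_corpus_from_walks

# 1) Join the per-relation ConceptNet files into ./data/cn_assertions_full.tsv
create_joined_assertions_for_random_walks()

# 2) Simulate the walks (p = q = 1.0, 5 walks per source node, lengths 20 and 30)
#    and pickle them as ./data/random_walk_full_assertions_<p>_<q>_<num_walks>_<walk_length>.p
generate_random_walks_from_assertions()

# 3) Turn each pickled walk file into a natural language corpus under ./data/rw_corpus_*
for suffix in ["1.0_1.0_5_20", "1.0_1.0_5_30"]:
    walks = load_walks("./data/random_walk_full_assertions_" + suffix + ".p")
    generate_corpus_from_walks(walks, output_path_suffix=suffix + "full_assertions_nl")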