fix bug of three or more entities per sentence; add io and path exception handling

gling07 · gling07 · commit 44a109f6349d · 2018-04-05T23:46:15.000-05:00
Signed-off-by: Gang Ling <linggang7@gmail.com>
diff --git a/CONFIG.cfg b/CONFIG.cfg
@@ -1,5 +1,5 @@
 [LTH]
-Path: <absolute-path-to-LTH>/lth_srl
+Path: /home/gangling/PycharmProjects/text2drs/lth_srl
 
 [CoreNLP]
-Path: <absolute-path-to-CoreNLP>/stanford-corenlp-full-2016-10-31
+Path: /home/gangling/PycharmProjects/text2drs/stanford-corenlp-full-2016-10-31
diff --git a/README.md b/README.md
@@ -14,9 +14,9 @@ From Narrative Text to Formal Action Language System Descriptions
 * Download or git clone (https://github.com/gling07/Text2DRS) Text2DRS repository
 * If you already have LTH or Stanford core-NLP 3.7.0, you can omit related steps and edit CONFIG file directly
 * Download LTH (http://nlp.cs.lth.se/software/semantic-parsing-propbank-nombank-frames/)
-* Unzip LTH package and move the package dictionary into Text2DRS repository folder
+* Unzip LTH package
 * Download Standford core-NLP **3.7.0** package (https://stanfordnlp.github.io/CoreNLP/history.html)
-* Unzip core-NLP package and move the package dictionary into Text2DRS repository folder
+* Unzip core-NLP package
 * Edit CONFIG.cfg file to include system paths of LTH and core-NLP package as following:
 ```
 [LTH]
diff --git a/corenlp.py b/corenlp.py
@@ -23,7 +23,7 @@
 
 
 def coreference(xml):
-    coref_dictionary = {}
+    coref_dictionary = dict()
     root = xml.getroot()
     for elem in root.findall('./document/coreference/coreference/'):
         is_mention = elem.attrib.get('representative')
diff --git a/drs.py b/drs.py
@@ -43,9 +43,9 @@ def main_process(data_dct_lst):
     return drs_dict
 
 def retrieve_entity(data_dct_lst):
-    entities = []
+    entities = list()
     for sentences in data_dct_lst:
-        temp = []
+        temp = list()
         for sen in sentences:
             if sen.get('PPOS') == 'NNP' or sen.get('PPOS') == 'NN':
                 temp.append(sen.get('Form'))
@@ -57,7 +57,7 @@ def retrieve_entity(data_dct_lst):
 
 
 def mapping_entity(entities):
-    entities_dictionary = {}
+    entities_dictionary = dict()
     count = 1;
     for entity in entities:
         entities_dictionary['r'+ str(count)] = entity
@@ -67,7 +67,7 @@ def mapping_entity(entities):
 
 
 def retrieve_property(entities_map):
-    properties = []
+    properties = list()
     for key, entity in entities_map.items():
         temp = (key, entity)
         properties.append(temp)
@@ -76,7 +76,7 @@ def retrieve_property(entities_map):
 
 
 def retrieve_event(data_dct_lst):
-    events_dictionary = {}
+    events_dictionary = dict()
     count = 1;
     for sentences in data_dct_lst:
         for sen in sentences:
diff --git a/drs2.py b/drs2.py
@@ -44,6 +44,7 @@ def drs_generator(data_dct_lst, coref_dictionary):
 
     return drs_dict
 
+
 def get_omit_entities(coref_dictionary):
 
     omit_list = list()
@@ -57,6 +58,7 @@ def get_omit_entities(coref_dictionary):
                 omit_list.append((key, v))
     return omit_list
 
+
 def get_all_entities(data_dct_lst, omit_list):
     entities = list()
     num = 0
@@ -72,7 +74,7 @@ def get_all_entities(data_dct_lst, omit_list):
 
 
 def mapping_entity(entities):
-    entities_dictionary = {}
+    entities_dictionary = dict()
     count = 1;
     for entity in entities:
         entities_dictionary['r'+ str(count)] = entity
@@ -82,7 +84,7 @@ def mapping_entity(entities):
 
 
 def retrieve_property(entities_map):
-    properties = []
+    properties = list()
     for key, entity in entities_map.items():
         temp = (key, entity)
         properties.append(temp)
@@ -91,7 +93,7 @@ def retrieve_property(entities_map):
 
 
 def retrieve_event(data_dct_lst):
-    events_dictionary = {}
+    events_dictionary = dict()
     count = 1;
     for sentences in data_dct_lst:
         for sen in sentences:
@@ -109,7 +111,7 @@ def retrieve_event_type(data_dct_lst):
     for sentence in data_dct_lst:
         for item in sentence:
             if item.get('PPOS') == 'VBD':
-                event_type_dictionary['e' + str(count)] = item.get('vn-pb')[0]['vn']
+                event_type_dictionary['e' + str(count)] = item.get('vn-pb')[0][1]
                 count += 1
 
     event_type_list = [(k, v) for k, v in event_type_dictionary.items()]
@@ -127,34 +129,40 @@ def retrieve_event_time(events_map):
     return event_time_list
 
 
-def retrieve_event_argument(data_dct_lst, property, eventType):
+def retrieve_event_argument(data_dct_lst, property, event_type):
+
     event_argument_list = list()
-    sentence_property = list()
-    sentence_rolesets = list()
-    for et, sentence in zip(eventType, data_dct_lst):
-        vn = et[1]
+    event_argument_dict = dict()
+    index = 1
+    for event, sentence in zip(event_type, data_dct_lst):
+        arguments_list = list()
+        args_to_vn = list()
+        event_ref = event[0]
+        for sent in sentence:
+            if sent.get('Args') != '_' and sent.get('vn-pb')[0] != '_':
+                # use first verb class as vn class
+                vn_role = sent.get('vn-pb')[0][1]
+                if sent.get('PPOS') == 'NNP' or sent.get('PPOS') == 'NN' or sent.get('PPOS') == 'TO':
+                    args_to_vn.append(vn_role)
+
+        sub_index = 0
         for item in sentence:
             tmp = list()
             if item.get('PPOS') == 'NNP' or item.get('PPOS') == 'NN':
-                sentence_property.append(item.get('Form'))
-            tmp += item.get('vn-pb')
-            for i in tmp:
-                k_list = [k for k in i.keys()]
-                for k in k_list:
-                    if k == vn:
-                        sentence_rolesets.append(i[k])
-
-    index = 0
-    count = 0
-    for p, r in zip(sentence_property, sentence_rolesets):
-        entity = ''
-        for i in property:
-            if i[1] == p:
-                entity = i[0]
-        event_argument_list.append((eventType[index][0], r, entity))
-        count += 1
-        if count == 2:
-            index += 1
-            count = 0
-
-    return event_argument_list
+                tmp.append(event_ref)
+                tmp.append(args_to_vn[sub_index])
+                sub_index += 1
+                entity = item.get('Form')
+                for (ref, ent) in property:
+                    if entity == ent:
+                        tmp.append(ref)
+                        break
+                arguments_list.append(tmp)
+        event_argument_dict[index] = arguments_list
+        index += 1
+
+    for value in event_argument_dict.values():
+        for v in value:
+            event_argument_list.append(v)
+
+    return event_argument_list
diff --git a/fileGenerator.py b/fileGenerator.py
@@ -23,8 +23,6 @@
 
 
 # print drs in asp format
-
-
 def drs_to_asp(drs_dict):
     print('%', end=' ')
     print(', '.join(drs_dict['entity']), end=', ')
@@ -88,6 +86,7 @@ def drs_to_asp(drs_dict):
                     print()
                     count = 0
 
+
 # print verbnet srl table
 def print_table(m_lst):
     dct_keys = m_lst[0][0].keys()
@@ -100,13 +99,18 @@ def print_table(m_lst):
             for key in dct_keys:
                 if key == 'vn-pb':
                     for item in sub_dct3.get(key):
-                        if 'vn' in item.keys():
-                            print('{};'.format(item.get('vn')), end="")
-                        elif '_' not in item.keys():
-                            for k,v in item.items():
-                                print('{}:{};'.format(k,v), end="")
+                        if item[0] == 'vn':
+                            print('{};'.format(item[1]), end='')
+                        elif item[0] != '_':
+                            count = 0
+                            for k in item:
+                                if count == 0:
+                                    print('{}'.format(k), end=":")
+                                    count += 1
+                                else:
+                                    print('{}'.format(k), end="; ")
+                                    count = 0
                         else:
-                            print('{:5s}'.format(item.get('_')), end="")
+                            print('{:5s}'.format(item[0]), end="")
                 else:
-                    print("{:10s}\t".format(sub_dct3.get(key)), end="")
-
+                    print("{:10s}\t".format(sub_dct3.get(key)), end="")
diff --git a/text2drs.py b/text2drs.py
@@ -25,6 +25,8 @@
 import sys
 import subprocess
 import argparse
+from typing import Any, Union
+
 import verbnetsrl
 import drs
 import xml.etree.ElementTree as ET
@@ -94,6 +96,7 @@ def process_lth(file, lth_path):
     # switch back to text2drs dictionary
     os.chdir(text2_drs_path)
 
+
 # process input file by running corenlp through command line
 # output file format can be choose from text, xml, json
 def process_corenlp(file, corenlp_path):
@@ -121,13 +124,24 @@ def main():
     parser.add_argument("input", help='given full path of input file', type=str)
     args = parser.parse_args()
 
-    config.read(args.config)
-    input_file = args.input
+    try:
+        config.read(args.config)
+    except IOError:
+        print('Could not find CONFIG file')
+
+    try:
+        input_file = args.input
+    except IOError:
+        print('Could not find the txt file')
 
+    input_file = args.input
     lth_path = config.get('LTH', 'Path')
-    corenlp_path = config.get('CoreNLP', 'Path')
 
-    process_lth(input_file, lth_path)
+    if os.path.exists(lth_path):
+        process_lth(input_file, lth_path)
+    else:
+        print('LTH path is invalid')
+        sys.exit()
 
     # read lth output file and store in lth_output
     lth_output = None
@@ -152,7 +166,13 @@ def main():
     sys.stdout = orig_stdout
     f.close()
 
-    corenlp_output_path = process_corenlp(input_file, corenlp_path)
+    corenlp_path = config.get('CoreNLP', 'Path')
+    if os.path.exists(corenlp_path):
+          corenlp_output_path = process_corenlp(input_file, corenlp_path)
+    else:
+        print('Core-NLP path invalid')
+        sys.exit()
+
     corenlp_output = None
     try:
         corenlp_output = ET.parse(corenlp_output_path)
diff --git a/verbnetsrl.py b/verbnetsrl.py