@@ -28,25 +28,22 @@ def get_data_paths(ace2005_path):
 
 
 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx, end_idx = -1, -1
+    start_idx = -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx
-        # if token['characterOffsetEnd'] == end_pos:
-        #     end_idx = idx - 1
 
     # Some of the ACE2005 data has annotation position errors.
-    if end_idx == -1:
-        end_idx = start_idx + len(phrase.split())
+    end_idx = start_idx + len(phrase.split())
 
     return start_idx, end_idx
 
 
 def preprocessing(data_type, files):
     result = []
-    event_count, entity_count, sent_count = 0, 0, 0
+    event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0
 
-    print('-' * 20)
+    print('=' * 20)
     print('[preprocessing] type: ', data_type)
     for file in tqdm(files):
         parser = Parser(path=file)
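Note on the first hunk: `find_token_index` now resolves the token span from the start character offset alone and derives the end index from the phrase's whitespace token count, because some ACE2005 annotations carry wrong end offsets. A minimal standalone sketch of the new behavior; the token dicts are made up but follow the CoreNLP shape (`characterOffsetBegin` is the only field the function reads):

```python
# Re-statement of the revised helper from the hunk above, runnable in isolation.
def find_token_index(tokens, start_pos, end_pos, phrase):
    start_idx = -1
    for idx, token in enumerate(tokens):
        # The last token starting at or before start_pos wins.
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx
    # Some ACE2005 end offsets are wrong, so end_idx is derived from the
    # whitespace-token length of the annotated phrase instead of end_pos.
    end_idx = start_idx + len(phrase.split())
    return start_idx, end_idx

# Hypothetical CoreNLP-style tokens for "Troops attacked the city":
tokens = [
    {'word': 'Troops',   'characterOffsetBegin': 0},
    {'word': 'attacked', 'characterOffsetBegin': 7},
    {'word': 'the',      'characterOffsetBegin': 16},
    {'word': 'city',     'characterOffsetBegin': 20},
]
print(find_token_index(tokens, 16, 24, 'the city'))  # (2, 4)
```

Under this scheme `end_idx` is exclusive (start index plus the phrase's token count); a phrase whose whitespace tokenization disagrees with CoreNLP's would still land on a slightly off span, which is the trade-off this hunk accepts.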
@@ -65,15 +62,14 @@ def preprocessing(data_type, files):
                 nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
                 nlp_res = json.loads(nlp_text)
             except Exception as e:
-                print('StanfordCore Exception ', e)
-                print('item["sentence"] :', item['sentence'])
-                print('nlp_text :', nlp_text)
+                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
                 continue
 
             tokens = nlp_res['sentences'][0]['tokens']
 
             if len(nlp_res['sentences']) >= 2:
-                print('len >=2! Sentence :', data['sentence'])
+                # TODO: handle cases where the sentence segmentation of NLTK and StanfordCoreNLP does not match.
+                # This error occurs so rarely (fewer than 20 sentences) that such sentences are ignored for now.
                 continue
 
             data['stanford-colcc'] = []
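One caveat on the new warning in the second hunk: `nlp_text` is only bound if `nlp.annotate` returned before the exception was raised; if `annotate` itself throws (e.g. on a timeout), the `print` in the `except` branch raises a `NameError` of its own. A defensive variant, sketched as a hypothetical helper rather than the committed code:

```python
import json

def annotate_or_skip(nlp, sentence):
    """Return the parsed CoreNLP result, or None if the sentence should be skipped."""
    nlp_text = None  # bind up front so the except branch can always print it
    try:
        nlp_text = nlp.annotate(
            sentence,
            properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
        return json.loads(nlp_text)
    except Exception as e:
        print('[Warning] StanfordCore Exception:', e,
              '| raw response:', nlp_text,
              '| This sentence will be ignored.')
        return None
```

Callers would `continue` on a `None` result, mirroring the committed control flow.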
@@ -104,7 +100,7 @@ def preprocessing(data_type, files):
                 data['golden-entity-mentions'].append(entity_mention)
 
             for event_mention in item['golden-event-mentions']:
-                # same event mention cab be shared
+                # same event mention can be shared
                 event_mention = copy.deepcopy(event_mention)
                 position = event_mention['trigger']['position']
                 start_idx, end_idx = find_token_index(
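The `copy.deepcopy` in the third hunk matters because, per the comment, the same event-mention dict can be shared across records, and the next hunk mutates it (`del event_mention['position']`). A tiny sketch of the aliasing problem the copy avoids, using a made-up mention dict:

```python
import copy

# Hypothetical event mention shared between two sentence records.
mention = {'event_type': 'Conflict:Attack', 'position': [16, 24]}

alias = mention                       # no copy: both names point at one dict
independent = copy.deepcopy(mention)  # as in the loop above

del independent['position']
print('position' in mention)  # True  -- the deep copy left the original intact
del alias['position']
print('position' in mention)  # False -- the alias mutated the shared dict
```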
@@ -120,6 +116,7 @@ def preprocessing(data_type, files):
                 del event_mention['position']
 
                 arguments = []
+                argument_count += len(event_mention['arguments'])
                 for argument in event_mention['arguments']:
                     position = argument['position']
                     start_idx, end_idx = find_token_index(
@@ -139,9 +136,11 @@ def preprocessing(data_type, files):
 
             result.append(data)
 
-    print('sent_count :', sent_count)
-    print('event_count :', event_count)
-    print('entity_count :', entity_count)
+    print('======[Statistics]======')
+    print('sent    :', sent_count)
+    print('event   :', event_count)
+    print('entity  :', entity_count)
+    print('argument:', argument_count)
 
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
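The fifth hunk's statistics are tallied during parsing, so one way to sanity-check a dump is to recount from the written file. A sketch, assuming the `output/` layout from the hunk above and the `golden-*-mentions` / `arguments` field names used throughout the diff; counts recomputed this way can come out slightly lower than the printed ones, since sentences that hit the CoreNLP exception or the multi-sentence `continue` never reach `result`:

```python
import json

# Recount one dumped split (e.g. train) and compare with the printed stats.
with open('output/train.json') as f:
    result = json.load(f)

sent_count = len(result)
event_count = sum(len(d['golden-event-mentions']) for d in result)
entity_count = sum(len(d['golden-entity-mentions']) for d in result)
argument_count = sum(len(e['arguments'])
                     for d in result
                     for e in d['golden-event-mentions'])
print('sent:', sent_count, 'event:', event_count,
      'entity:', entity_count, 'argument:', argument_count)
```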
@@ -156,6 +155,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('dev', dev_files)
         preprocessing('train', train_files)
         preprocessing('test', test_files)
+        preprocessing('dev', dev_files)