
Commit b1c2ce8

improve exception handling
1 parent 63f5268 commit b1c2ce8

3 files changed (+32 lines, -26 lines)


README.md

Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ If you want to know event types and arguments in detail, read [this document (AC
 
 ### Data Split
 
-The resulting data is divided into test/dev/train as follows.
+The result of data is divided into test/dev/train as follows.
 ```
 ├── output
 │   └── test.json

main.py

Lines changed: 15 additions & 16 deletions

@@ -28,25 +28,22 @@ def get_data_paths(ace2005_path):
 
 
 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx, end_idx = -1, -1
+    start_idx = -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx
-        # if token['characterOffsetEnd'] == end_pos:
-        #     end_idx = idx - 1
 
     # Some of the ACE2005 data has annotation position errors.
-    if end_idx == -1:
-        end_idx = start_idx + len(phrase.split())
+    end_idx = start_idx + len(phrase.split())
 
     return start_idx, end_idx
 
 
 def preprocessing(data_type, files):
     result = []
-    event_count, entity_count, sent_count = 0, 0, 0
+    event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0
 
-    print('-' * 20)
+    print('=' * 20)
     print('[preprocessing] type: ', data_type)
     for file in tqdm(files):
         parser = Parser(path=file)
@@ -65,15 +62,14 @@ def preprocessing(data_type, files):
                 nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
                 nlp_res = json.loads(nlp_text)
             except Exception as e:
-                print('StanfordCore Exception ', e)
-                print('item["sentence"] :', item['sentence'])
-                print('nlp_text :', nlp_text)
+                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
                 continue
 
             tokens = nlp_res['sentences'][0]['tokens']
 
             if len(nlp_res['sentences']) >= 2:
-                print('len >=2! Sentence :', data['sentence'])
+                # TODO: issue where the sentence segmentation of NLTK and StanfordCoreNLP do not match.
+                # This error occurred so rarely that it was temporarily ignored (< 20 sentences).
                 continue
 
             data['stanford-colcc'] = []
@@ -104,7 +100,7 @@ def preprocessing(data_type, files):
                 data['golden-entity-mentions'].append(entity_mention)
 
             for event_mention in item['golden-event-mentions']:
-                # same event mention cab be shared
+                # same event mention can be shared
                 event_mention = copy.deepcopy(event_mention)
                 position = event_mention['trigger']['position']
                 start_idx, end_idx = find_token_index(
@@ -120,6 +116,7 @@ def preprocessing(data_type, files):
                 del event_mention['position']
 
                 arguments = []
+                argument_count += len(event_mention['arguments'])
                 for argument in event_mention['arguments']:
                     position = argument['position']
                     start_idx, end_idx = find_token_index(
@@ -139,9 +136,11 @@ def preprocessing(data_type, files):
 
             result.append(data)
 
-    print('sent_count :', sent_count)
-    print('event_count :', event_count)
-    print('entity_count :', entity_count)
+    print('======[Statistics]======')
+    print('sent :', sent_count)
+    print('event :', event_count)
+    print('entity :', entity_count)
+    print('argument:', argument_count)
 
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
@@ -156,6 +155,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('dev', dev_files)
         preprocessing('train', train_files)
         preprocessing('test', test_files)
+        preprocessing('dev', dev_files)
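
The substantive change in find_token_index is that the end of a mention is no longer located by matching character offsets at all: because some ACE2005 annotations carry wrong offsets, the end index is now always derived from the whitespace token count of the annotated phrase. Below is a minimal standalone sketch of the revised logic; the token dicts imitate Stanford CoreNLP's JSON output, and the sample sentence is invented for illustration.

```python
# Sketch of the revised find_token_index from this commit; sample data is invented.

def find_token_index(tokens, start_pos, end_pos, phrase):
    # Remember the last token whose start offset is at or before start_pos.
    start_idx = -1
    for idx, token in enumerate(tokens):
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx

    # Some ACE2005 annotations have wrong end offsets, so end_pos is ignored
    # and the end index is derived from the phrase's token count instead.
    end_idx = start_idx + len(phrase.split())
    return start_idx, end_idx


if __name__ == '__main__':
    # "troops arrived in Baghdad": the phrase "arrived in" covers tokens 1-2.
    tokens = [
        {'word': 'troops', 'characterOffsetBegin': 0},
        {'word': 'arrived', 'characterOffsetBegin': 7},
        {'word': 'in', 'characterOffsetBegin': 15},
        {'word': 'Baghdad', 'characterOffsetBegin': 18},
    ]
    print(find_token_index(tokens, start_pos=7, end_pos=17, phrase='arrived in'))  # (1, 3)
```

As written, end_idx is one past the last token of the phrase, so the span is the half-open token range [start_idx, end_idx), assuming the phrase splits on whitespace the same way CoreNLP tokenized the sentence.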

parser.py

Lines changed: 16 additions & 9 deletions

@@ -7,6 +7,7 @@
 
 class Parser:
     def __init__(self, path):
+        self.path = path
         self.entity_mentions = []
         self.event_mentions = []
         self.sentences = []
@@ -55,10 +56,16 @@ def get_data(self):
                 if text_position[0] <= event_position[0] and event_position[1] <= text_position[1]:
                     event_arguments = []
                     for argument in event_mention['arguments']:
+                        try:
+                            entity_type = entity_map[argument['entity-id']]['entity-type']
+                        except KeyError:
+                            print('[Warning] The argument refers to an entity in another sentence. This argument will be ignored.')
+                            continue
+
                         event_arguments.append({
                             'role': argument['role'],
                             'position': argument['position'],
-                            'entity-type': entity_map[argument['entity-id']]['entity-type'],
+                            'entity-type': entity_type,
                             'text': self.clean_text(argument['text']),
                         })
 
@@ -71,16 +78,15 @@ def get_data(self):
             data.append(item)
         return data
 
-    @staticmethod
-    def find_correct_offset(sgm_text, start_index, text):
+    def find_correct_offset(self, sgm_text, start_index, text):
         offset = 0
-        for i in range(0, 50):
+        for i in range(0, 70):
             for j in [-1, 1]:
                 offset = i * j
                 if sgm_text[start_index + offset:start_index + offset + len(text)] == text:
                     return offset
 
-        print('[Warning] fail to find offset! (start_index: {}, text: {})'.format(start_index, text))
+        print('[Warning] fail to find offset! (start_index: {}, text: {}, path: {})'.format(start_index, text, self.path))
         return offset
 
     def fix_wrong_position(self):
@@ -240,11 +246,12 @@ def parse_value_timex_tag(node):
 
 
 if __name__ == '__main__':
-    parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331')
-    # parser = Parser('./data/ace_2005_td_v7/data/English/un/adj/alt.atheism_20041104.2428')
+    # parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331')
+    parser = Parser('./data/ace_2005_td_v7/data/English/un/timex2norm/alt.corel_20041228.0503')
    data = parser.get_data()
     with open('./output/debug.json', 'w') as f:
         json.dump(data, f, indent=2)
 
-    index = parser.sgm_text.find("the two")
-    # print(parser.sgm_text[index:])
+    # index = parser.sgm_text.find("Diego Garcia")
+    # print('index :', index)
+    # print(parser.sgm_text[1918 - 30:])
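
The offset-repair strategy behind these warnings is find_correct_offset: it probes outward from the annotated start index, alternating left and right one character at a time, until the slice of the .sgm text matches the annotated string; this commit widens the probe radius from 50 to 70 characters and adds the source file path to the failure warning. Here is a self-contained sketch of that search pattern, with an invented sgm_text and a deliberately shifted start index:

```python
# Standalone sketch of parser.py's find_correct_offset search pattern;
# the sample text and the shifted index below are invented for illustration.

def find_correct_offset(sgm_text, start_index, text, radius=70):
    # Probe offsets 0, -1, +1, -2, +2, ... out to the radius, until the slice
    # at the corrected position exactly matches the annotated text.
    offset = 0
    for i in range(0, radius):
        for j in [-1, 1]:
            offset = i * j
            if sgm_text[start_index + offset:start_index + offset + len(text)] == text:
                return offset

    print('[Warning] failed to find offset! (start_index: {}, text: {})'.format(start_index, text))
    return offset


if __name__ == '__main__':
    sgm_text = 'U.S. forces seized the island of Diego Garcia in the operation.'
    # The annotation claims the mention starts at index 36; it actually starts at 33.
    print(find_correct_offset(sgm_text, 36, 'Diego Garcia'))  # prints -3
```

One quirk preserved here, as in the original, is that i = 0 tests offset 0 twice, and a failed search returns the last probed offset rather than a sentinel value.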
