Commit 3eee0b6

fix critical start/end position error
1 parent b1c2ce8 commit 3eee0b6

2 files changed (+40, -10 lines)

main.py (+39, -9)

@@ -1,5 +1,6 @@
 import os
 import copy
+import re
 from parser import Parser
 import json
 from stanfordcorenlp import StanfordCoreNLP
@@ -9,7 +10,6 @@

 def get_data_paths(ace2005_path):
     test_files, dev_files, train_files = [], [], []
-
     with open('./data_list.csv', mode='r') as csv_file:
         rows = csv_file.readlines()
         for row in rows[1:]:
@@ -28,13 +28,30 @@ def get_data_paths(ace2005_path):


 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx = -1
+    start_idx, end_idx = -1, -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx

-    # Some of the ACE2005 data has annotation position errors.
-    end_idx = start_idx + len(phrase.split())
+    assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens)
+    chars = ''
+
+    def remove_punc(s):
+        s = re.sub(r'[^\w]', '', s)
+        return s
+
+    for i in range(0, len(tokens) - start_idx):
+        chars += remove_punc(tokens[start_idx + i]['originalText'])
+        if remove_punc(phrase) in chars:
+            end_idx = start_idx + i + 1
+            break
+
+    assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars:{}".format(end_idx, end_pos, phrase, tokens, chars)
+    return start_idx, end_idx
+
+
+def find_token_index_v2(words, phrase):
+    start_idx, end_idx = -1, -1

     return start_idx, end_idx

@@ -59,10 +76,11 @@ def preprocessing(data_type, files):
             data['golden-event-mentions'] = []

             try:
-                nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
-                nlp_res = json.loads(nlp_text)
+                nlp_res_raw = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
+                nlp_res = json.loads(nlp_res_raw)
             except Exception as e:
-                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
+                print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
+                print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
                 continue

             tokens = nlp_res['sentences'][0]['tokens']
@@ -91,6 +109,10 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=entity_mention['text'],
                 )
+                # start_idx, end_idx = find_token_index_v2(
+                #     words=data['words'],
+                #     phrase=entity_mention['text'],
+                # )

                 entity_mention['start'] = start_idx
                 entity_mention['end'] = end_idx
@@ -109,6 +131,10 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=event_mention['trigger']['text'],
                 )
+                # start_idx, end_idx = find_token_index_v2(
+                #     words=data['words'],
+                #     phrase=event_mention['trigger']['text'],
+                # )

                 event_mention['trigger']['start'] = start_idx
                 event_mention['trigger']['end'] = end_idx
@@ -125,6 +151,10 @@ def preprocessing(data_type, files):
                         end_pos=position[1] - sent_start_pos + 1,
                         phrase=argument['text'],
                     )
+                    # start_idx, end_idx = find_token_index_v2(
+                    #     words=data['words'],
+                    #     phrase=argument['text'],
+                    # )
                     argument['start'] = start_idx
                     argument['end'] = end_idx
                     del argument['position']
@@ -155,6 +185,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('train', train_files)
-        preprocessing('test', test_files)
         preprocessing('dev', dev_files)
+        preprocessing('test', test_files)
+        preprocessing('train', train_files)
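
The heart of the fix is the rewritten find_token_index: instead of trusting the annotated end offsets (which, as the deleted comment noted, are sometimes wrong in ACE2005), it now grows the span token by token, comparing punctuation-stripped text until the phrase is covered; end_pos survives only in the assertion message. The snippet below is a minimal standalone sketch of that logic, not part of the commit: the sample tokens are hand-made stand-ins for Stanford CoreNLP output, filling in only the two fields the function actually reads ('characterOffsetBegin' and 'originalText').

import re

# Standalone sketch of the new matching logic in this commit (illustration only).
def find_token_index(tokens, start_pos, end_pos, phrase):
    # The last token whose begin offset is <= start_pos becomes the start token.
    start_idx, end_idx = -1, -1
    for idx, token in enumerate(tokens):
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx
    assert start_idx != -1

    def remove_punc(s):
        return re.sub(r'[^\w]', '', s)

    # Accumulate punctuation-stripped token text until the phrase appears,
    # rather than trusting the (sometimes wrong) annotated end offset.
    chars = ''
    for i in range(0, len(tokens) - start_idx):
        chars += remove_punc(tokens[start_idx + i]['originalText'])
        if remove_punc(phrase) in chars:
            end_idx = start_idx + i + 1
            break
    assert end_idx != -1
    return start_idx, end_idx

# Hand-made tokens mimicking CoreNLP's output shape (not real CoreNLP output).
tokens = [
    {'characterOffsetBegin': 0,  'originalText': 'Barack'},
    {'characterOffsetBegin': 7,  'originalText': 'Obama'},
    {'characterOffsetBegin': 13, 'originalText': 'spoke'},
    {'characterOffsetBegin': 18, 'originalText': '.'},
]
print(find_token_index(tokens, start_pos=0, end_pos=12, phrase='Barack Obama'))  # -> (0, 2)

Note that end_idx is exclusive by construction (one past the last matched token), and the new find_token_index_v2 introduced by the diff is still a stub whose call sites are commented out.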

parser.py (+1, -1)

@@ -52,7 +52,7 @@ def get_data(self):
                     entity_map[entity_mention['entity-id']] = entity_mention

             for event_mention in self.event_mentions:
-                event_position = event_mention['position']
+                event_position = event_mention['trigger']['position']
                 if text_position[0] <= event_position[0] and event_position[1] <= text_position[1]:
                     event_arguments = []
                     for argument in event_mention['arguments']:
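
The parser.py change is a single line, but it is the "critical" part of the commit message: the sentence-containment test now reads the trigger's character offsets, which sit nested under the 'trigger' key of an event mention rather than at the top level. A rough sketch of the dict shape implied by the diff (all values are made up for illustration):

# Event-mention shape as implied by this diff; the values are illustrative.
event_mention = {
    'trigger': {
        'text': 'spoke',
        'position': [13, 17],  # character offsets of the trigger word
    },
    'arguments': [],
}

# Old code read event_mention['position'], the wrong lookup for this structure;
# the fix uses the trigger's own offsets when matching events to sentences.
event_position = event_mention['trigger']['position']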
