@@ -1,5 +1,6 @@
import os
import copy
+ import re
from parser import Parser
import json
from stanfordcorenlp import StanfordCoreNLP
@@ -9,7 +10,6 @@

def get_data_paths(ace2005_path):
    test_files, dev_files, train_files = [], [], []
-
    with open('./data_list.csv', mode='r') as csv_file:
        rows = csv_file.readlines()
        for row in rows[1:]:
@@ -28,13 +28,30 @@ def get_data_paths(ace2005_path):


def find_token_index(tokens, start_pos, end_pos, phrase):
-     start_idx = -1
+     start_idx, end_idx = -1, -1
    for idx, token in enumerate(tokens):
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx

-     # Some of the ACE2005 data has annotation position errors.
-     end_idx = start_idx + len(phrase.split())
+     assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens)
+     chars = ''
+
+     def remove_punc(s):
+         s = re.sub(r'[^\w]', '', s)
+         return s
+
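+     # Some of the ACE2005 data has annotation position errors, so rather than
+     # trusting end_pos, grow a punctuation-stripped window of tokens until it
+     # covers the punctuation-stripped phrase.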
+     for i in range(0, len(tokens) - start_idx):
+         chars += remove_punc(tokens[start_idx + i]['originalText'])
+         if remove_punc(phrase) in chars:
+             end_idx = start_idx + i + 1
+             break
+
+     assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars: {}".format(end_idx, end_pos, phrase, tokens, chars)
+     return start_idx, end_idx
+
+
+ def find_token_index_v2(words, phrase):
+     start_idx, end_idx = -1, -1

    return start_idx, end_idx

@@ -59,10 +76,11 @@ def preprocessing(data_type, files):
        data['golden-event-mentions'] = []

        try:
-             nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
-             nlp_res = json.loads(nlp_text)
+             nlp_res_raw = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
+             nlp_res = json.loads(nlp_res_raw)
        except Exception as e:
-             print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
+             print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
+             print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
            continue

        tokens = nlp_res['sentences'][0]['tokens']
@@ -91,6 +109,10 @@ def preprocessing(data_type, files):
                end_pos=position[1] - sent_start_pos + 1,
                phrase=entity_mention['text'],
            )
+             # start_idx, end_idx = find_token_index_v2(
+             #     words=data['words'],
+             #     phrase=entity_mention['text'],
+             # )

            entity_mention['start'] = start_idx
            entity_mention['end'] = end_idx
@@ -109,6 +131,10 @@ def preprocessing(data_type, files):
                end_pos=position[1] - sent_start_pos + 1,
                phrase=event_mention['trigger']['text'],
            )
+             # start_idx, end_idx = find_token_index_v2(
+             #     words=data['words'],
+             #     phrase=event_mention['trigger']['text'],
+             # )

            event_mention['trigger']['start'] = start_idx
            event_mention['trigger']['end'] = end_idx
@@ -125,6 +151,10 @@ def preprocessing(data_type, files):
                    end_pos=position[1] - sent_start_pos + 1,
                    phrase=argument['text'],
                )
+                 # start_idx, end_idx = find_token_index_v2(
+                 #     words=data['words'],
+                 #     phrase=argument['text'],
+                 # )
                argument['start'] = start_idx
                argument['end'] = end_idx
                del argument['position']
@@ -155,6 +185,6 @@ def preprocessing(data_type, files):
    with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
        # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
        # print(res)
-         preprocessing('train', train_files)
-         preprocessing('test', test_files)
        preprocessing('dev', dev_files)
+         preprocessing('test', test_files)
+         preprocessing('train', train_files)
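
As a quick sanity check of the new matching logic, the following minimal sketch replays the committed find_token_index (asserts omitted) on hand-made CoreNLP-style tokens. The sentence, offsets, and tokens list are invented for illustration and are not taken from ACE2005.

import re

def remove_punc(s):
    return re.sub(r'[^\w]', '', s)

def find_token_index(tokens, start_pos, end_pos, phrase):
    # Locate the last token starting at or before start_pos, then grow a
    # punctuation-stripped window of token text until it covers the
    # punctuation-stripped phrase.
    start_idx, end_idx = -1, -1
    for idx, token in enumerate(tokens):
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx
    chars = ''
    for i in range(0, len(tokens) - start_idx):
        chars += remove_punc(tokens[start_idx + i]['originalText'])
        if remove_punc(phrase) in chars:
            end_idx = start_idx + i + 1
            break
    return start_idx, end_idx

# Toy CoreNLP-style tokens for the invented sentence "U.S. forces attacked."
tokens = [
    {'originalText': 'U.S.', 'characterOffsetBegin': 0},
    {'originalText': 'forces', 'characterOffsetBegin': 5},
    {'originalText': 'attacked', 'characterOffsetBegin': 12},
    {'originalText': '.', 'characterOffsetBegin': 20},
]

# Even though the annotated phrase omits the dots, punctuation-stripped
# matching still recovers the right token span.
print(find_token_index(tokens, 0, 11, 'US forces'))  # -> (0, 2)

This is also why the stub find_token_index_v2(words, phrase) takes pre-tokenized words rather than CoreNLP tokens; its body is left empty in this commit, and the calls to it below remain commented out.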