@@ -28,25 +28,22 @@ def get_data_paths(ace2005_path):
 
 
 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx, end_idx = -1, -1
+    start_idx = -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx
-        # if token['characterOffsetEnd'] == end_pos:
-        #     end_idx = idx - 1
 
     # Some of the ACE2005 data has annotation position errors.
-    if end_idx == -1:
-        end_idx = start_idx + len(phrase.split())
+    end_idx = start_idx + len(phrase.split())
 
     return start_idx, end_idx
 
 
 def preprocessing(data_type, files):
     result = []
-    event_count, entity_count, sent_count = 0, 0, 0
+    event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0
 
-    print('-' * 20)
+    print('=' * 20)
     print('[preprocessing] type: ', data_type)
     for file in tqdm(files):
         parser = Parser(path=file)
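Note on the first hunk: `find_token_index` now resolves the token span from the start character offset alone and derives the end index from the phrase's whitespace token count, because some ACE2005 annotations carry wrong end offsets. A minimal standalone sketch of the new behavior; the token dicts are made up but follow the CoreNLP shape (`characterOffsetBegin` is the only field the function reads):

```python
# Re-statement of the revised helper from the hunk above, runnable in isolation.
def find_token_index(tokens, start_pos, end_pos, phrase):
    start_idx = -1
    for idx, token in enumerate(tokens):
        # The last token starting at or before start_pos wins.
        if token['characterOffsetBegin'] <= start_pos:
            start_idx = idx
    # Some ACE2005 end offsets are wrong, so end_idx is derived from the
    # whitespace-token length of the annotated phrase instead of end_pos.
    end_idx = start_idx + len(phrase.split())
    return start_idx, end_idx

# Hypothetical CoreNLP-style tokens for "Troops attacked the city":
tokens = [
    {'word': 'Troops',   'characterOffsetBegin': 0},
    {'word': 'attacked', 'characterOffsetBegin': 7},
    {'word': 'the',      'characterOffsetBegin': 16},
    {'word': 'city',     'characterOffsetBegin': 20},
]
print(find_token_index(tokens, 16, 24, 'the city'))  # (2, 4)
```

Under this scheme `end_idx` is exclusive (start index plus the phrase's token count); a phrase whose whitespace tokenization disagrees with CoreNLP's would still land on a slightly off span, which is the trade-off this hunk accepts.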
@@ -65,15 +62,14 @@ def preprocessing(data_type, files):
                 nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
                 nlp_res = json.loads(nlp_text)
             except Exception as e:
-                print('StanfordCore Exception ', e)
-                print('item["sentence"] :', item['sentence'])
-                print('nlp_text :', nlp_text)
+                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
                 continue
 
             tokens = nlp_res['sentences'][0]['tokens']
 
             if len(nlp_res['sentences']) >= 2:
-                print('len >=2! Sentence :', data['sentence'])
+                # TODO: handle cases where the sentence segmentation of NLTK and StanfordCoreNLP does not match.
+                # This error occurs so rarely (fewer than 20 sentences) that such sentences are ignored for now.
                 continue
 
             data['stanford-colcc'] = []
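One caveat on the new warning in the second hunk: `nlp_text` is only bound if `nlp.annotate` returned before the exception was raised; if `annotate` itself throws (e.g. on a timeout), the `print` in the `except` branch raises a `NameError` of its own. A defensive variant, sketched as a hypothetical helper rather than the committed code:

```python
import json

def annotate_or_skip(nlp, sentence):
    """Return the parsed CoreNLP result, or None if the sentence should be skipped."""
    nlp_text = None  # bind up front so the except branch can always print it
    try:
        nlp_text = nlp.annotate(
            sentence,
            properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
        return json.loads(nlp_text)
    except Exception as e:
        print('[Warning] StanfordCore Exception:', e,
              '| raw response:', nlp_text,
              '| This sentence will be ignored.')
        return None
```

Callers would `continue` on a `None` result, mirroring the committed control flow.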
@@ -104,7 +100,7 @@ def preprocessing(data_type, files):
                 data['golden-entity-mentions'].append(entity_mention)
 
             for event_mention in item['golden-event-mentions']:
-                # same event mention cab be shared
+                # same event mention can be shared
                 event_mention = copy.deepcopy(event_mention)
                 position = event_mention['trigger']['position']
                 start_idx, end_idx = find_token_index(
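The `copy.deepcopy` in the third hunk matters because, per the comment, the same event-mention dict can be shared across records, and the next hunk mutates it (`del event_mention['position']`). A tiny sketch of the aliasing problem the copy avoids, using a made-up mention dict:

```python
import copy

# Hypothetical event mention shared between two sentence records.
mention = {'event_type': 'Conflict:Attack', 'position': [16, 24]}

alias = mention                       # no copy: both names point at one dict
independent = copy.deepcopy(mention)  # as in the loop above

del independent['position']
print('position' in mention)  # True  -- the deep copy left the original intact
del alias['position']
print('position' in mention)  # False -- the alias mutated the shared dict
```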
@@ -120,6 +116,7 @@ def preprocessing(data_type, files):
                 del event_mention['position']
 
                 arguments = []
+                argument_count += len(event_mention['arguments'])
                 for argument in event_mention['arguments']:
                     position = argument['position']
                     start_idx, end_idx = find_token_index(
@@ -139,9 +136,11 @@ def preprocessing(data_type, files):
 
             result.append(data)
 
-    print('sent_count :', sent_count)
-    print('event_count :', event_count)
-    print('entity_count :', entity_count)
+    print('======[Statistics]======')
+    print('sent    :', sent_count)
+    print('event   :', event_count)
+    print('entity  :', entity_count)
+    print('argument:', argument_count)
 
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
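The fifth hunk's statistics are tallied during parsing, so one way to sanity-check a dump is to recount from the written file. A sketch, assuming the `output/` layout from the hunk above and the `golden-*-mentions` / `arguments` field names used throughout the diff; counts recomputed this way can come out slightly lower than the printed ones, since sentences that hit the CoreNLP exception or the multi-sentence `continue` never reach `result`:

```python
import json

# Recount one dumped split (e.g. train) and compare with the printed stats.
with open('output/train.json') as f:
    result = json.load(f)

sent_count = len(result)
event_count = sum(len(d['golden-event-mentions']) for d in result)
entity_count = sum(len(d['golden-entity-mentions']) for d in result)
argument_count = sum(len(e['arguments'])
                     for d in result
                     for e in d['golden-event-mentions'])
print('sent:', sent_count, 'event:', event_count,
      'entity:', entity_count, 'argument:', argument_count)
```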
@@ -156,6 +155,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('dev', dev_files)
         preprocessing('train', train_files)
         preprocessing('test', test_files)
+        preprocessing('dev', dev_files)