@@ -50,10 +50,43 @@ def remove_punc(s):
     return start_idx, end_idx


-def find_token_index_v2(words, phrase):
-    start_idx, end_idx = -1, -1
+def verify_result(data):
+    def remove_punctuation(s):
+        for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
+            s = s.replace(c, '')
+        s = re.sub(r'[^\w]', '', s)
+        return s

-    return start_idx, end_idx
+    def check_diff(words, phrase):
+        return remove_punctuation(phrase) not in remove_punctuation(words)
+
+    for item in data:
+        words = item['words']
+        for entity_mention in item['golden-entity-mentions']:
+            if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] entity has invalid start/end')
+                print('Expected: ', entity_mention['text'])
+                print('Actual:', words[entity_mention['start']:entity_mention['end']])
+                print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words))
+
+        for event_mention in item['golden-event-mentions']:
+            trigger = event_mention['trigger']
+            if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] trigger has invalid start/end')
+                print('Expected: ', trigger['text'])
+                print('Actual:', words[trigger['start']:trigger['end']])
+                print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words))
+            for argument in event_mention['arguments']:
+                if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')):
+                    print('============================')
+                    print('[Warning] argument has invalid start/end')
+                    print('Expected: ', argument['text'])
+                    print('Actual:', words[argument['start']:argument['end']])
+                    print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words))
+
+    print('Complete verification')

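Not part of the patch: a minimal standalone sketch of the containment check the new verify_result relies on. The sample item below is hypothetical; only the field names (words, golden-entity-mentions, start, end, text) follow the structure the added code reads.

import re

# Hypothetical item mirroring the structure verify_result expects:
# a token list plus gold mentions carrying token-level start/end offsets.
item = {
    'words': ['U.S.', 'troops', 'entered', 'Baghdad', '.'],
    'golden-entity-mentions': [
        {'text': 'U.S. troops', 'start': 0, 'end': 2},  # correct span
        {'text': 'Baghdad', 'start': 2, 'end': 3},      # wrong span -> would trigger the warning
    ],
}

def remove_punctuation(s):
    # Same normalization as the patch: drop PTB bracket tokens, then all non-word characters.
    for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
        s = s.replace(c, '')
    return re.sub(r'[^\w]', '', s)

for m in item['golden-entity-mentions']:
    span = ''.join(item['words'][m['start']:m['end']])
    # check_diff in the patch is this containment test on normalized strings.
    invalid = remove_punctuation(m['text'].replace(' ', '')) not in remove_punctuation(span)
    print(m['text'], '->', 'invalid start/end' if invalid else 'ok')

Running this prints 'ok' for the first mention and flags the second, which is the same condition under which verify_result prints its warning block.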
@@ -109,10 +142,6 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=entity_mention['text'],
                 )
-                # start_idx, end_idx = find_token_index_v2(
-                #     words=data['words'],
-                #     phrase=entity_mention['text'],
-                # )

                 entity_mention['start'] = start_idx
                 entity_mention['end'] = end_idx
@@ -131,10 +160,6 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=event_mention['trigger']['text'],
                 )
-                # start_idx, end_idx = find_token_index_v2(
-                #     words=data['words'],
-                #     phrase=event_mention['trigger']['text'],
-                # )

                 event_mention['trigger']['start'] = start_idx
                 event_mention['trigger']['end'] = end_idx
@@ -151,10 +176,7 @@ def preprocessing(data_type, files):
                         end_pos=position[1] - sent_start_pos + 1,
                         phrase=argument['text'],
                     )
-                    # start_idx, end_idx = find_token_index_v2(
-                    #     words=data['words'],
-                    #     phrase=argument['text'],
-                    # )
+
                     argument['start'] = start_idx
                     argument['end'] = end_idx
                     del argument['position']
@@ -172,6 +194,7 @@ def preprocessing(data_type, files):
     print('entity :', entity_count)
     print('argument:', argument_count)

+    verify_result(result)
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
