
Commit db784f1

add verification func for result
1 parent 3eee0b6 commit db784f1

2 files changed: +42 −19 lines


README.md

+4-4
@@ -119,7 +119,7 @@ This project use the same data partitioning as the previous work ([Yang and Mitc
 Below is information about the amount of parsed data when using this project. It is slightly different from the parsing results of the two papers above. The difference seems to have occurred because there are no promised rules for splitting sentences within the sgm format files.
 
 | | Documents | Sentences |Triggers | Arguments | Entity Mentions |
-|------- |-----------|----------- |---------------|----------------- |----------------- |
-| Test | 40 | 713 | 424 | 878 | 4226 |
-| Dev | 30 | 875 | 505 | 906 | 4050 |
-| Train | 529 | 14724 | 4420 | 7147 | 53045 |
+|------- |--------------|--------------|------------|-----------|----------------- |
+| Test | 40 | 713 | 424 | 892 | 4226 |
+| Dev | 30 | 875 | 505 | 933 | 4050 |
+| Train | 529 | 14724 | 4420 | 7811 | 53045 |

main.py

+38-15
@@ -50,10 +50,43 @@ def remove_punc(s):
     return start_idx, end_idx
 
 
-def find_token_index_v2(words, phrase):
-    start_idx, end_idx = -1, -1
+def verify_result(data):
+    def remove_punctuation(s):
+        for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
+            s = s.replace(c, '')
+        s = re.sub(r'[^\w]', '', s)
+        return s
 
-    return start_idx, end_idx
+    def check_diff(words, phrase):
+        return remove_punctuation(phrase) not in remove_punctuation(words)
+
+    for item in data:
+        words = item['words']
+        for entity_mention in item['golden-entity-mentions']:
+            if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] entity has invalid start/end')
+                print('Expected: ', entity_mention['text'])
+                print('Actual:', words[entity_mention['start']:entity_mention['end']])
+                print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words))
+
+        for event_mention in item['golden-event-mentions']:
+            trigger = event_mention['trigger']
+            if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] trigger has invalid start/end')
+                print('Expected: ', trigger['text'])
+                print('Actual:', words[trigger['start']:trigger['end']])
+                print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words))
+            for argument in event_mention['arguments']:
+                if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')):
+                    print('============================')
+                    print('[Warning] argument has invalid start/end')
+                    print('Expected: ', argument['text'])
+                    print('Actual:', words[argument['start']:argument['end']])
+                    print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words))
+
+    print('Complete verification')
 
 
 def preprocessing(data_type, files):
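A quick way to read the new check: `check_diff` strips bracket tokens and punctuation from both the joined token span and the annotated phrase, then warns only when the phrase is not a substring of the span. Below is a minimal, hypothetical smoke test (not part of this commit); the record layout is taken from the function body above, while the toy sentence, its values, and the `from main import` line are assumptions for illustration.

```python
# Hypothetical smoke test for verify_result; assumes main.py can be
# imported without triggering a full preprocessing run.
from main import verify_result

items = [{
    'words': ['Barry', 'Diller', 'quit', 'Vivendi', 'Universal', '.'],
    'golden-entity-mentions': [
        # Valid span: ''.join(words[0:2]) == 'BarryDiller' contains the
        # punctuation-stripped phrase, so no warning is printed.
        {'text': 'Barry Diller', 'start': 0, 'end': 2},
        # Invalid span: ''.join(words[3:4]) == 'Vivendi' does not contain
        # 'VivendiUniversal', so a '[Warning] entity has invalid start/end'
        # block is printed.
        {'text': 'Vivendi Universal', 'start': 3, 'end': 4},
    ],
    'golden-event-mentions': [],
}]

verify_result(items)  # ends with: Complete verification
```

Because this is a substring test after normalization, spans that merely include extra tokens (say, a stray period) pass silently; only spans whose text no longer covers the annotated phrase are flagged.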
@@ -109,10 +142,6 @@ def preprocessing(data_type, files):
                 end_pos=position[1] - sent_start_pos + 1,
                 phrase=entity_mention['text'],
             )
-            # start_idx, end_idx = find_token_index_v2(
-            #     words=data['words'],
-            #     phrase=entity_mention['text'],
-            # )
 
             entity_mention['start'] = start_idx
             entity_mention['end'] = end_idx
@@ -131,10 +160,6 @@ def preprocessing(data_type, files):
                 end_pos=position[1] - sent_start_pos + 1,
                 phrase=event_mention['trigger']['text'],
             )
-            # start_idx, end_idx = find_token_index_v2(
-            #     words=data['words'],
-            #     phrase=event_mention['trigger']['text'],
-            # )
 
             event_mention['trigger']['start'] = start_idx
             event_mention['trigger']['end'] = end_idx
@@ -151,10 +176,7 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=argument['text'],
                 )
-                # start_idx, end_idx = find_token_index_v2(
-                #     words=data['words'],
-                #     phrase=argument['text'],
-                # )
+
                 argument['start'] = start_idx
                 argument['end'] = end_idx
                 del argument['position']
@@ -172,6 +194,7 @@ def preprocessing(data_type, files):
     print('entity :', entity_count)
     print('argument:', argument_count)
 
+    verify_result(result)
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
 
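With `verify_result(result)` wired in just before the dump, the same check can also be re-run offline against a previously written split. A rough sketch, assuming the `output/{data_type}.json` files produced above and that `main` imports without side effects:

```python
import json

from main import verify_result  # assumption: no side effects on import

# Re-verify an emitted split, e.g. the test set written by preprocessing().
with open('output/test.json') as f:
    data = json.load(f)

verify_result(data)  # prints one warning block per misaligned span
```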