From 4f79f38d4aa2955e70f6d85ec787c050f3c7e121 Mon Sep 17 00:00:00 2001 From: Tianhui Zhang Date: Mon, 25 Nov 2024 22:17:18 +0000 Subject: [PATCH] Fix the track 1 with the intensity and It can not address the mulitple labels for each instance --- annotation_agreement.py | 475 +++++++++++++++++++++++++--------------- 1 file changed, 295 insertions(+), 180 deletions(-) diff --git a/annotation_agreement.py b/annotation_agreement.py index d728872..7ded0f6 100644 --- a/annotation_agreement.py +++ b/annotation_agreement.py @@ -1,25 +1,3 @@ -# Copyright 2024 Idris Abdulmumin -# -# Adapted from https://github.com/emotion-analysis-project/emotion-dataset-pipeline/blob/main/scripts/agreement_potato.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script processes annotated emotion data and calculates Krippendorff's Alpha -# to measure the reliability of annotations from Label Studio annotated instances. - -# To install dependencies, run: -# pip install pandas krippendorff - import os import json import math @@ -27,185 +5,322 @@ import krippendorff import pandas as pd from tabulate import tabulate +from sklearn.model_selection import train_test_split def format_emotion(emotion_value): - try: - return json.loads(emotion_value.replace('""', '"'))['choices'] - except: - return [emotion_value] - -def process_row(row): - result = {} - for col in emotion_columns: - if pd.notna(row[col]): - result[col.split('-')[-1].capitalize()] = row[col].split(':')[0].strip() - if 'Neutral' in format_emotion(row['emotion']): - result['Neutral'] = '' - return result + """ + Formats the emotion value by parsing the JSON string and extracting the 'choices' key. + If parsing fails, returns the original value in a list. -def preprocess_data(data, emotions): - filtered_data = data[['text', 'annotator', 'emotions_present']] - - binary_dataframes = {} - for emotion in emotions: - binary_dataframes[emotion] = filtered_data[['text', 'annotator']].copy() - - binary_dataframes[emotion] = binary_dataframes[emotion].drop_duplicates(subset=['annotator', 'text']) - binary_dataframes[emotion][emotion] = filtered_data.apply(lambda row: 1 if emotion in row['emotions_present'].keys() else 0, axis=1) - binary_dataframes[emotion] = binary_dataframes[emotion].pivot(index='annotator', columns='text', values=emotion) + Args: + emotion_value (str): The raw emotion value as a string. - return binary_dataframes + Returns: + list: A list of emotions extracted from the input value. + """ + try: + return json.loads(emotion_value.replace('""', '"'))['choices'] + except: + return [emotion_value] -def get_all_keys(dicts): - return list([k for d in dicts for k in d.keys()]) -if __name__ == '__main__': +def process_row(row): + """ + Processes a row of the data to extract emotions and their intensities. - parser = argparse.ArgumentParser(description='Process emotion annotations and calculate Krippendorff\'s Alpha') - parser.add_argument('--input', type=str, help='Path to the input CSV file', required=True) - parser.add_argument('--output', type=str, help='Path to the output files', default='') - parser.add_argument('--merge_annotators', type=str, help='Merge annotations from multiple annotators', default='') - parser.add_argument('--lang', type=str, help='Language of the dataset', required=True) - parser.add_argument('--batch', type=str, help='Batch number of the dataset', required=True) + Args: + row (pd.Series): A row of the DataFrame containing emotion intensity columns. - args = parser.parse_args() + Returns: + dict: A dictionary where keys are emotion names and values are their respective intensities. + """ + result = {} + for col in emotion_columns: + if pd.notna(row[col]): + result[col.split('-')[-1].capitalize()] = row[col].split(':')[0].strip() + if 'Neutral' in format_emotion(row['emotion']): + result['Neutral'] = '' + return result - input_path = args.input - output_path = args.output - lang = args.lang - batch = args.batch - merge_annotators = args.merge_annotators - assert input_path.endswith('.csv') or input_path.endswith('.tsv'), 'Input file must be a CSV or TSV file' - assert os.path.exists(input_path), f'Input file: {input_path} does not exist' +def preprocess_data(data, emotions): + """ + Preprocesses the data to create binary DataFrames for each emotion, indicating whether the emotion is present. - if output_path != '': - try: - os.makedirs(output_path) - except: - pass - else: - print('No output path specified. Saving output files to the input file directory.') - output_path = os.path.dirname(input_path) + Args: + data (pd.DataFrame): The input DataFrame containing text, annotator, and emotions_present columns. + emotions (list): A list of all emotions present in the dataset. - file_name = os.path.splitext(os.path.basename(input_path))[0] + Returns: + dict: A dictionary where keys are emotion names and values are binary DataFrames indicating emotion presence. + """ + filtered_data = data[['text', 'annotator', 'emotions_present']] - try: - df = pd.read_csv(input_path) - except: - try: - df = pd.read_csv(input_path, sep='\t') - except: - raise ValueError('Could not read input file. Please make sure the file is a valid CSV or TSV file.') + binary_dataframes = {} + for emotion in emotions: + binary_dataframes[emotion] = filtered_data[['text', 'annotator']].copy() - emotion_columns = [col for col in df.columns if 'emotion-' in col] - df['emotions_present'] = df.apply(process_row, axis=1) + binary_dataframes[emotion] = binary_dataframes[emotion].drop_duplicates(subset=['annotator', 'text']) + binary_dataframes[emotion][emotion] = filtered_data.apply( + lambda row: 1 if emotion in row['emotions_present'].keys() else 0, axis=1 + ) + binary_dataframes[emotion] = binary_dataframes[emotion].pivot(index='annotator', columns='text', values=emotion) - if merge_annotators != '': - input_dict = json.loads(args.merge_annotators) - for merges in input_dict.values(): - new_ann = '&'.join(merges) - for merge in merges: - df.loc[df.annotator == merge, 'annotator'] = new_ann + return binary_dataframes - df[['annotation_id', 'text', 'annotator', 'emotions_present']].to_csv(os.path.join(output_path, 'individual_annotations.csv'), index=False) - # print(df.annotator.unique()) +def get_all_keys(dicts): + """ + Gets all unique keys from a list of dictionaries. + + Args: + dicts (list): A list of dictionaries. + + Returns: + list: A list of all unique keys from the input dictionaries. + """ + return list([k for d in dicts for k in d.keys()]) + +def calculate_emotion_intensity(row, emotion): + """ + Calculates the intensity value of a given emotion for a specific row. + + Args: + row (pd.Series): A row of the DataFrame containing emotion intensity columns. + emotion (str): The emotion to calculate intensity for. + + Returns: + int: The intensity value (0 if not found, 1 for Low, 2 for Medium, 3 for High). + """ + intensity_str = row.get(f'emotion-intensity-{emotion.lower()}', '') + if pd.notna(intensity_str): + if 'Low' in intensity_str: + return 1 + elif 'Medium' in intensity_str: + return 2 + elif 'High' in intensity_str: + return 3 + return 0 + +def calculate_emotion_distribution(df, emotions, set_name): + """ + Calculates and prints the emotion distribution for the given dataset. + + Args: + df (pd.DataFrame): The dataset for which to calculate the distribution. + emotions (list): A list of all emotions present in the dataset. + set_name (str): The name of the dataset (e.g., 'train', 'dev', 'test'). + """ + emotion_counts = {} + for emotion in emotions: + emotion_counts[emotion] = df[emotion].sum() + emotion_counts['Neutral'] = df['Neutral'].sum() - emotions = [i for i in df.explode('emotions_present')['emotions_present'].unique().tolist() if pd.notna(i)] - print(f'Emotion classes in dataset: {emotions}') + print(f'Emotion distribution for {set_name} set:') + for emotion, count in emotion_counts.items(): + print(f'{emotion}: {count}') + print('\n') - annotations = preprocess_data(df, emotions) - annotators = df.annotator.unique().tolist() - majority_vote_value = len(annotators) // 2 + 1 - print(f'\nAnnotators: {annotators}') - # print aumber of annotations by each annotator - print(f'Number of annotations by each annotator:') - for annotator in annotators: - print(f'{annotator}: {df[df.annotator == annotator].shape[0]}') - - # determine if the sum of two annotators' annotations == len(annotations) +if __name__ == '__main__': - print(f'\nNumber of annotators: {len(annotators)}') - print(f'Majority vote value: {majority_vote_value}\n') + parser = argparse.ArgumentParser(description='Process emotion annotations and calculate Krippendorff\'s Alpha') + parser.add_argument('--input', type=str, help='Path to the input CSV file', default='first_step.csv') + parser.add_argument('--output', type=str, help='Path to the output files', default='output') + parser.add_argument('--merge_annotators', type=str, help='Merge annotations from multiple annotators', default='') + parser.add_argument('--lang', type=str, help='Language of the dataset', default='Chinese',) + parser.add_argument('--batch', type=str, help='Batch number of the dataset', default='1',) + parser.add_argument('--if_split', type=bool, help='If each csv file split into train/dev/test', default=True) + + args = parser.parse_args() + + input_path = args.input + output_path = args.output + lang = args.lang + batch = args.batch + merge_annotators = args.merge_annotators + + assert input_path.endswith('.csv') or input_path.endswith('.tsv'), 'Input file must be a CSV or TSV file' + assert os.path.exists(input_path), f'Input file: {input_path} does not exist' + + if output_path != '': + try: + os.makedirs(output_path) + except: + pass + else: + print('No output path specified. Saving output files to the input file directory.') + output_path = os.path.dirname(input_path) + + file_name = os.path.splitext(os.path.basename(input_path))[0] - # process present emotions - ann_df = pd.DataFrame(columns=['text'] + emotions) - unique_texts = df.text.unique() - ann_df['text'] = unique_texts + try: + df = pd.read_csv(input_path) + except: + try: + df = pd.read_csv(input_path, sep='\t') + except: + raise ValueError('Could not read input file. Please make sure the file is a valid CSV or TSV file.') + + emotion_columns = [col for col in df.columns if 'emotion-intensity-' in col] + df['emotions_present'] = df.apply(process_row, axis=1) + + if merge_annotators != '': + input_dict = json.loads(args.merge_annotators) + for merges in input_dict.values(): + new_ann = '&'.join(merges) + for merge in merges: + df.loc[df.annotator == merge, 'annotator'] = new_ann + + df[['annotation_id', 'text', 'annotator', 'emotions_present']].to_csv( + os.path.join(output_path, 'individual_annotations.csv'), index=False + ) + emotions = [i for i in df.explode('emotions_present')['emotions_present'].unique().tolist() if pd.notna(i)] + print(f'Emotion classes in dataset: {emotions}') + annotations = preprocess_data(df, emotions) + + annotators = df.annotator.unique().tolist() + print(f'\nAnnotators: {annotators}') + print(f'Number of annotations by each annotator:') + for annotator in annotators: + print(f'{annotator}: {df[df.annotator == annotator].shape[0]}') + print(f'\nNumber of annotators: {len(annotators)}') + + # process present emotions + ann_df = pd.DataFrame(columns=['text'] + emotions) + unique_texts = df.text.unique() + ann_df['text'] = unique_texts + + # Dictionary to store average intensity for each emotion per text + average_intensity_dict = {} + + for text in unique_texts: + text_df = df[df.text == text] + # Get the number of unique annotators for this text + annotator_count = text_df['annotator'].nunique() + total_intensity = {emotion: 0 for emotion in emotions} + for _, row in text_df.iterrows(): + for emotion in emotions: + total_intensity[emotion] += calculate_emotion_intensity(row, emotion) + + # if a text is neither joyful, fearful, angry, sad etc, we classify it as neutral. + all_emotions_zero = True + for emotion in emotions: + average_intensity = total_intensity[emotion] / annotator_count + if average_intensity > 0.5: + ann_df.loc[ann_df.text == text, emotion] = 1 + all_emotions_zero = False + else: + ann_df.loc[ann_df.text == text, emotion] = 0 + average_intensity_dict[(text, emotion)] = average_intensity + + # If all emotions are zero, mark as Neutral + if all_emotions_zero: + ann_df.loc[ann_df.text == text, 'Neutral'] = 1 + else: + ann_df.loc[ann_df.text == text, 'Neutral'] = 0 + + ann_df.insert(0, 'text_id', value=ann_df.index) + ann_df['text_id'] = ann_df['text_id'].apply(lambda x: f'{lang.lower()}_{batch}_{str(x + 1).zfill(5)}') + ann_df.to_csv(os.path.join(output_path, f'processed_emotion_annotations.csv'), index=False) + + # Chinese has very rare fear annotations, so we need to split the dataset to ensure a balanced distribution + # Split the dataset into train, dev, and test sets + if args.if_split: + emotion_counts = ann_df[emotions+ ['Neutral']].sum().sort_values() + rarest_emotion = emotion_counts.idxmin() + rarest_emotion_count = emotion_counts.min() + print(f"Rarest emotion: {rarest_emotion} with {rarest_emotion_count} occurrences.") + + rarest_emotion_rows = ann_df[ann_df[rarest_emotion] > 0] + remaining_rows = ann_df[~ann_df.index.isin(rarest_emotion_rows.index)] + # Split at leat 5 rarest emotion rows to dev set + rarest_dev, rarest_train = train_test_split(rarest_emotion_rows, test_size=len(rarest_emotion_rows) - 5, random_state=42) + remaining_train, remaining_dev = train_test_split(remaining_rows, test_size=200 - len(rarest_dev), random_state=42) + dev_set = pd.concat([rarest_dev, remaining_dev]) + train_set = pd.concat([rarest_train, remaining_train]) + # Split the train set into train and test sets + train_set, test_set = train_test_split(train_set, test_size=0.5, random_state=42) + + train_set = train_set.sort_values(by='text_id') + dev_set = dev_set.sort_values(by='text_id') + test_set = test_set.sort_values(by='text_id') + + calculate_emotion_distribution(train_set, emotions, 'train') + calculate_emotion_distribution(dev_set, emotions, 'dev') + calculate_emotion_distribution(test_set, emotions, 'test') + + + # Save the train, dev, and test sets to CSV files + train_set.to_csv(os.path.join(output_path, 'train_emotion.csv'), index=False) + dev_set.to_csv(os.path.join(output_path, 'dev_emotion.csv'), index=False) + test_set.to_csv(os.path.join(output_path, 'test_emotion.csv'), index=False) - for text in unique_texts: - text_df = df[df.text == text] - emotions_present = text_df['emotions_present'].tolist() - present_emotions = get_all_keys(emotions_present) + + + # Calculate the intensity for each emotion and save to a new CSV file + ann_intensity = ann_df.copy() + for i, row in enumerate(ann_intensity.itertuples()): + present_emotions = [emotion for emotion in emotions if getattr(row, emotion) == 1] + if present_emotions: + for emotion in present_emotions: + average_intensity = average_intensity_dict[(row.text, emotion)] + # Classify based on average score required by the guidelines + intensity_val = min(3, round(average_intensity)) + ann_intensity.loc[ann_intensity.text == row.text, emotion] = intensity_val + + ann_intensity.to_csv(os.path.join(output_path, f'processed_intensity_annotations.csv'), index=False) + + if args.if_split: + train_intensity = ann_intensity[ann_intensity['text'].isin(train_set.text)] + dev_intensity = ann_intensity[ann_intensity['text'].isin(dev_set.text)] + test_intensity = ann_intensity[ann_intensity['text'].isin(test_set.text)] + train_intensity.to_csv(os.path.join(output_path, 'train_intensity.csv'), index=False) + dev_intensity.to_csv(os.path.join(output_path, 'dev_intensity.csv'), index=False) + test_intensity.to_csv(os.path.join(output_path, 'test_intensity.csv'), index=False) + + + + + # Statistical calculations for emotion counts + emotion_counts = {} for emotion in emotions: - emotion_count = sum([1 for emotions in emotions_present if emotion in emotions]) - ann_df.loc[ann_df.text == text, emotion] = 1 if emotion_count >= majority_vote_value else 0 - - ann_df.insert(0, 'text_id', value = ann_df.index) - ann_df['text_id'] = ann_df['text_id'].apply(lambda x: f'{lang.lower()}_{batch}_{str(x + 1).zfill(5)}') - - ann_df.to_csv(os.path.join(output_path, f'processed_emotion_annotations.csv'), index=False) - - intensity_value = { - 'Low': 1, - 'Medium': 2, - 'High': 3 - } - - ann_intensity = ann_df.copy() - for i, row in enumerate(ann_intensity.itertuples()): - present_emotions = [emotion for emotion in emotions if row.__getattribute__(emotion) == 1] - if present_emotions: - temp_annotations = df[(df.text == row.text)].copy() - r_emotions = temp_annotations['emotions_present'].to_list() - for emotion in present_emotions: - intensities = [r_emotion[emotion] if emotion in r_emotion else '0' for r_emotion in r_emotions] - intensity_val = math.ceil(sum([intensity_value.get(i) if i in intensity_value else 0 for i in intensities]) / len(temp_annotations)) - - ann_intensity.loc[ann_intensity.text == row.text, emotion] = intensity_val - - ann_intensity.to_csv(os.path.join(output_path, f'processed_intensity_annotations.csv'), index=False) - - # saving stats to file - - emotion_counts = {} - for emotion in emotions: - emotion_counts[emotion] = annotations[emotion].sum().sum() - - with open(os.path.join(output_path, f'stats.txt'), 'w') as f: - f.write(f'Number of annotations per emotion:\n') - headers = ['Emotion', 'Count in annotations', 'Count after majority vote', 'Krippendorff\'s Alpha'] - data_list = [[key, value] for key, value in emotion_counts.items()] - for row in data_list: - row.append(ann_df[ann_df[row[0]] == 1].shape[0]) - - avg = 0 - emotion_alphas = {} - for emotion in annotations: - alpha = krippendorff.alpha(annotations[emotion].values) - avg += alpha - - emotion_alphas[emotion] = alpha - - avg /= len(emotions) - emotion_alphas['Average'] = avg - - for row in data_list: - row.append(emotion_alphas[row[0]]) - - data_list.append(['Total', sum(emotion_counts.values()), sum([row[2] for row in data_list]), avg]) - - f.write(tabulate(data_list, headers=headers, tablefmt='grid')) - - f.write('\n\nNumber of 0, 1, ... n emotions per text:\n') - n_emotion_counts = {} - for n in range(len(emotions) + 1): - n_count = len(ann_df[ann_df[emotions].sum(axis=1) == n]) - if n_count > 0: - n_emotion_counts[n] = n_count - - data_list = [[key, value] for key, value in n_emotion_counts.items()] - headers = ['Number of Emotions', 'Count'] - f.write(tabulate(data_list, headers=headers, tablefmt='grid')) \ No newline at end of file + emotion_counts[emotion] = annotations[emotion].sum().sum() + + + with open(os.path.join(output_path, f'stats.txt'), 'w') as f: + f.write(f'Number of annotations per emotion:\n') + headers = ['Emotion', 'Count in annotations', 'Count after majority vote', "Krippendorff's Alpha"] + data_list = [[key, value] for key, value in emotion_counts.items()] + for row in data_list: + row.append(ann_df[ann_df[row[0]] == 1].shape[0]) + + avg = 0 + emotion_alphas = {} + for emotion in annotations: + alpha = krippendorff.alpha(annotations[emotion].values) + avg += alpha + + emotion_alphas[emotion] = alpha + + avg /= len(emotions) + emotion_alphas['Average'] = avg + + for row in data_list: + row.append(emotion_alphas[row[0]]) + + data_list.append(['Total', sum(emotion_counts.values()), sum([row[2] for row in data_list]), avg]) + + f.write(tabulate(data_list, headers=headers, tablefmt='grid')) + + # Calculate the number of texts with 0, 1, ... n emotions + f.write('\n\nNumber of 0, 1, ... n emotions per text:\n') + n_emotion_counts = {} + for n in range(len(emotions) + 1): + n_count = len(ann_df[ann_df[emotions].sum(axis=1) == n]) + if n_count > 0: + n_emotion_counts[n] = n_count + + data_list = [[key, value] for key, value in n_emotion_counts.items()] + headers = ['Number of Emotions', 'Count'] + f.write(tabulate(data_list, headers=headers, tablefmt='grid')) \ No newline at end of file