42 lines
900 B
Python
42 lines
900 B
Python
"""Summarise evaluation metrics: mean scores per category and overall.

Reads ``evaluation_metrics.json``, flattens every group of question items
into one table, and prints the mean ``bleu_score``, ``f1_score`` and
``llm_score`` (rounded to 4 decimals) per category and across all items.
"""

import json

import pandas as pd

# Input file read when the script is run directly.
METRICS_PATH = 'evaluation_metrics.json'

# Score columns that are averaged; kept in one place so the per-category
# and overall summaries cannot drift apart.
SCORE_COLUMNS = ['bleu_score', 'f1_score', 'llm_score']


def load_items(path: str = METRICS_PATH) -> list:
    """Load the metrics file and flatten it into a list of question items.

    The top-level JSON value is indexed by each of its keys and every
    value is concatenated, in key order, into a single list.

    Args:
        path: Location of the JSON metrics file.

    Returns:
        All question items from every top-level key, in file order.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_items = []
    for key in data:
        all_items.extend(data[key])
    return all_items


def build_frame(items: list) -> pd.DataFrame:
    """Turn question items into a DataFrame with a numeric ``category``.

    Args:
        items: Question items, each providing at least ``category`` and
            the columns listed in ``SCORE_COLUMNS``.

    Returns:
        A DataFrame with one row per item.

    Raises:
        ValueError: If ``items`` is empty (there is nothing to summarise).
    """
    if not items:
        # Without this guard the failure is an opaque KeyError: 'category'.
        raise ValueError('no question items found in the metrics data')

    df = pd.DataFrame(items)
    # Categories may be stored as strings; convert so groups sort numerically.
    df['category'] = pd.to_numeric(df['category'])
    return df


def category_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Return mean scores (4 decimals) and the item count for each category."""
    grouped = df.groupby('category')
    result = grouped[SCORE_COLUMNS].mean().round(4)
    # The count is added after rounding so it stays an integer column.
    result['count'] = grouped.size()
    return result


def overall_summary(df: pd.DataFrame) -> pd.Series:
    """Return the mean of each score column over all items (4 decimals)."""
    return df[SCORE_COLUMNS].mean().round(4)


def main() -> None:
    """Load the metrics file and print the per-category and overall means."""
    df = build_frame(load_items())

    print("Mean Scores Per Category:")
    print(category_summary(df))

    print("\nOverall Mean Scores:")
    print(overall_summary(df))


if __name__ == "__main__":
    main()