Formatting (#2750)
@@ -33,35 +33,34 @@ Do NOT include both CORRECT and WRONG in your response, or it will break the eva
 Just return the label CORRECT or WRONG in a json format with the key as "label".
 """

+
 def evaluate_llm_judge(question, gold_answer, generated_answer):
     """Evaluate the generated answer against the gold answer using an LLM judge."""
     response = client.chat.completions.create(
         model="gpt-4o-mini",
-        messages=[{
-            "role": "user",
-            "content": ACCURACY_PROMPT.format(
-                question=question,
-                gold_answer=gold_answer,
-                generated_answer=generated_answer
-            )
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": ACCURACY_PROMPT.format(
+                    question=question, gold_answer=gold_answer, generated_answer=generated_answer
+                ),
+            }
+        ],
         response_format={"type": "json_object"},
-        temperature=0.0
+        temperature=0.0,
     )
-    label = json.loads(response.choices[0].message.content)['label']
+    label = json.loads(response.choices[0].message.content)["label"]
     return 1 if label == "CORRECT" else 0


 def main():
     """Main function to evaluate RAG results using LLM judge."""
-    parser = argparse.ArgumentParser(
-        description='Evaluate RAG results using LLM judge'
-    )
+    parser = argparse.ArgumentParser(description="Evaluate RAG results using LLM judge")
     parser.add_argument(
-        '--input_file',
+        "--input_file",
         type=str,
         default="results/default_run_v4_k30_new_graph.json",
-        help='Path to the input dataset file'
+        help="Path to the input dataset file",
     )

     args = parser.parse_args()
@@ -78,10 +77,10 @@ def main():
     index = 0
     for k, v in data.items():
         for x in v:
-            question = x['question']
-            gold_answer = x['answer']
-            generated_answer = x['response']
-            category = x['category']
+            question = x["question"]
+            gold_answer = x["answer"]
+            generated_answer = x["response"]
+            category = x["category"]

             # Skip category 5
             if int(category) == 5:
@@ -92,13 +91,15 @@ def main():
             LLM_JUDGE[category].append(label)

             # Store the results
-            RESULTS[index].append({
-                "question": question,
-                "gt_answer": gold_answer,
-                "response": generated_answer,
-                "category": category,
-                "llm_label": label
-            })
+            RESULTS[index].append(
+                {
+                    "question": question,
+                    "gt_answer": gold_answer,
+                    "response": generated_answer,
+                    "category": category,
+                    "llm_label": label,
+                }
+            )

             # Save intermediate results
             with open(output_path, "w") as f:
@@ -108,8 +109,7 @@ def main():
         print("All categories accuracy:")
         for cat, results in LLM_JUDGE.items():
             if results:  # Only print if there are results for this category
-                print(f" Category {cat}: {np.mean(results):.4f} "
-                      f"({sum(results)}/{len(results)})")
+                print(f" Category {cat}: {np.mean(results):.4f} " f"({sum(results)}/{len(results)})")
         print("------------------------------------------")
         index += 1

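For context, a minimal standalone sketch of the judge call after this formatting pass. Only the evaluate_llm_judge body mirrors the diff; the OpenAI client setup, the stand-in ACCURACY_PROMPT, and the example question/answer strings are assumptions added for illustration.

# Minimal usage sketch of the reformatted judge (not part of the commit).
# Assumptions: the `openai` package is installed, OPENAI_API_KEY is set, and
# ACCURACY_PROMPT below is a stand-in for the real prompt defined earlier in
# the file; the question/answer strings are placeholders for illustration.
import json

from openai import OpenAI

client = OpenAI()

# Stand-in prompt; the actual ACCURACY_PROMPT lives near the top of the file.
ACCURACY_PROMPT = (
    "Question: {question}\n"
    "Gold answer: {gold_answer}\n"
    "Generated answer: {generated_answer}\n"
    'Just return the label CORRECT or WRONG in a json format with the key as "label".'
)


def evaluate_llm_judge(question, gold_answer, generated_answer):
    """Evaluate the generated answer against the gold answer using an LLM judge."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": ACCURACY_PROMPT.format(
                    question=question, gold_answer=gold_answer, generated_answer=generated_answer
                ),
            }
        ],
        response_format={"type": "json_object"},
        temperature=0.0,
    )
    label = json.loads(response.choices[0].message.content)["label"]
    return 1 if label == "CORRECT" else 0


if __name__ == "__main__":
    score = evaluate_llm_judge(
        question="In which year did the first Moon landing take place?",
        gold_answer="1969",
        generated_answer="The first Moon landing happened in 1969.",
    )
    print(f"LLM judge score: {score}")  # 1 for CORRECT, 0 for WRONG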