Formatting (#2750)

Dev Khant
2025-05-22 01:17:29 +05:30
committed by GitHub
parent dff91154a7
commit d85fcda037
71 changed files with 1391 additions and 1823 deletions
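The changes below consistently switch single-quoted strings to double quotes and collapse wrapped call sites onto longer lines, which is consistent with running an auto-formatter such as Black with an extended line length (roughly 120 characters, judging by the re-wrapped lines). The formatter and its configuration are not part of this commit, so the following is only a sketch of how such output could be reproduced, assuming Black is the tool:

# Hedged sketch: reproduce the quote and wrapping style seen in this diff with Black's
# Python API. The 120-character line length is inferred from the diff, not taken from a
# config file in this commit.
import black

src = (
    "parser.add_argument('--max_workers', type=int, default=10,\n"
    "                    help='Maximum number of worker threads')\n"
)
print(black.format_str(src, mode=black.Mode(line_length=120)))
# Expected shape: one line with double quotes, e.g.
# parser.add_argument("--max_workers", type=int, default=10, help="Maximum number of worker threads")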

View File

@@ -14,46 +14,47 @@ def process_item(item_data):
local_results = defaultdict(list)
for item in v:
gt_answer = str(item['answer'])
pred_answer = str(item['response'])
category = str(item['category'])
question = str(item['question'])
gt_answer = str(item["answer"])
pred_answer = str(item["response"])
category = str(item["category"])
question = str(item["question"])
# Skip category 5
if category == '5':
if category == "5":
continue
metrics = calculate_metrics(pred_answer, gt_answer)
bleu_scores = calculate_bleu_scores(pred_answer, gt_answer)
llm_score = evaluate_llm_judge(question, gt_answer, pred_answer)
local_results[k].append({
"question": question,
"answer": gt_answer,
"response": pred_answer,
"category": category,
"bleu_score": bleu_scores["bleu1"],
"f1_score": metrics["f1"],
"llm_score": llm_score
})
local_results[k].append(
{
"question": question,
"answer": gt_answer,
"response": pred_answer,
"category": category,
"bleu_score": bleu_scores["bleu1"],
"f1_score": metrics["f1"],
"llm_score": llm_score,
}
)
return local_results
def main():
parser = argparse.ArgumentParser(description='Evaluate RAG results')
parser.add_argument('--input_file', type=str,
default="results/rag_results_500_k1.json",
help='Path to the input dataset file')
parser.add_argument('--output_file', type=str,
default="evaluation_metrics.json",
help='Path to save the evaluation results')
parser.add_argument('--max_workers', type=int, default=10,
help='Maximum number of worker threads')
parser = argparse.ArgumentParser(description="Evaluate RAG results")
parser.add_argument(
"--input_file", type=str, default="results/rag_results_500_k1.json", help="Path to the input dataset file"
)
parser.add_argument(
"--output_file", type=str, default="evaluation_metrics.json", help="Path to save the evaluation results"
)
parser.add_argument("--max_workers", type=int, default=10, help="Maximum number of worker threads")
args = parser.parse_args()
with open(args.input_file, 'r') as f:
with open(args.input_file, "r") as f:
data = json.load(f)
results = defaultdict(list)
@@ -61,18 +62,16 @@ def main():
# Use ThreadPoolExecutor with specified workers
with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
futures = [executor.submit(process_item, item_data)
for item_data in data.items()]
futures = [executor.submit(process_item, item_data) for item_data in data.items()]
for future in tqdm(concurrent.futures.as_completed(futures),
total=len(futures)):
for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
local_results = future.result()
with results_lock:
for k, items in local_results.items():
results[k].extend(items)
# Save results to JSON file
with open(args.output_file, 'w') as f:
with open(args.output_file, "w") as f:
json.dump(results, f, indent=4)
print(f"Results saved to {args.output_file}")
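The script above fans per-conversation scoring out to a thread pool and merges each worker's partial results under a shared lock; the lock itself is defined outside the hunk shown. A minimal, self-contained sketch of that map-and-merge pattern (the input data and `score_item` body are hypothetical stand-ins, and the module-level `threading.Lock()` is an assumption about the elided setup):

# Hedged sketch of the thread-pool map/merge pattern used above. `score_item` and `data`
# are hypothetical; `results_lock` is assumed to be a module-level threading.Lock(),
# mirroring the `results_lock` referenced but not defined in the hunk.
import concurrent.futures
import threading
from collections import defaultdict

results_lock = threading.Lock()
results = defaultdict(list)

def score_item(item_data):
    key, items = item_data
    return {key: [{"index": i} for i, _ in enumerate(items)]}  # placeholder scoring

data = {"conv_0": ["q1", "q2"], "conv_1": ["q3"]}  # hypothetical input
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(score_item, item) for item in data.items()]
    for future in concurrent.futures.as_completed(futures):
        local = future.result()
        with results_lock:
            for k, items in local.items():
                results[k].extend(items)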

View File

@@ -3,7 +3,7 @@ import json
import pandas as pd
# Load the evaluation metrics data
with open('evaluation_metrics.json', 'r') as f:
with open("evaluation_metrics.json", "r") as f:
data = json.load(f)
# Flatten the data into a list of question items
@@ -15,28 +15,20 @@ for key in data:
df = pd.DataFrame(all_items)
# Convert category to numeric type
df['category'] = pd.to_numeric(df['category'])
df["category"] = pd.to_numeric(df["category"])
# Calculate mean scores by category
result = df.groupby('category').agg({
'bleu_score': 'mean',
'f1_score': 'mean',
'llm_score': 'mean'
}).round(4)
result = df.groupby("category").agg({"bleu_score": "mean", "f1_score": "mean", "llm_score": "mean"}).round(4)
# Add count of questions per category
result['count'] = df.groupby('category').size()
result["count"] = df.groupby("category").size()
# Print the results
print("Mean Scores Per Category:")
print(result)
# Calculate overall means
overall_means = df.agg({
'bleu_score': 'mean',
'f1_score': 'mean',
'llm_score': 'mean'
}).round(4)
overall_means = df.agg({"bleu_score": "mean", "f1_score": "mean", "llm_score": "mean"}).round(4)
print("\nOverall Mean Scores:")
print(overall_means)

View File

@@ -33,35 +33,34 @@ Do NOT include both CORRECT and WRONG in your response, or it will break the eva
Just return the label CORRECT or WRONG in a json format with the key as "label".
"""
def evaluate_llm_judge(question, gold_answer, generated_answer):
"""Evaluate the generated answer against the gold answer using an LLM judge."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": ACCURACY_PROMPT.format(
question=question,
gold_answer=gold_answer,
generated_answer=generated_answer
)
}],
messages=[
{
"role": "user",
"content": ACCURACY_PROMPT.format(
question=question, gold_answer=gold_answer, generated_answer=generated_answer
),
}
],
response_format={"type": "json_object"},
temperature=0.0
temperature=0.0,
)
label = json.loads(response.choices[0].message.content)['label']
label = json.loads(response.choices[0].message.content)["label"]
return 1 if label == "CORRECT" else 0
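The judge reduces the model's JSON verdict to a binary score: 1 for CORRECT, 0 for WRONG. A hedged usage sketch (the question and answers are invented for illustration; calling this requires the module-level OpenAI `client` and API key configured as in the file above):

# Hypothetical example inputs; per the function above, the return value is 1 when the
# judge labels the answer CORRECT and 0 otherwise.
score = evaluate_llm_judge(
    question="Where did the speaker say they moved last spring?",
    gold_answer="Seattle",
    generated_answer="They moved to Seattle.",
)
print(score)  # 1 or 0, depending on the judge's label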
def main():
"""Main function to evaluate RAG results using LLM judge."""
parser = argparse.ArgumentParser(
description='Evaluate RAG results using LLM judge'
)
parser = argparse.ArgumentParser(description="Evaluate RAG results using LLM judge")
parser.add_argument(
'--input_file',
"--input_file",
type=str,
default="results/default_run_v4_k30_new_graph.json",
help='Path to the input dataset file'
help="Path to the input dataset file",
)
args = parser.parse_args()
@@ -78,10 +77,10 @@ def main():
index = 0
for k, v in data.items():
for x in v:
question = x['question']
gold_answer = x['answer']
generated_answer = x['response']
category = x['category']
question = x["question"]
gold_answer = x["answer"]
generated_answer = x["response"]
category = x["category"]
# Skip category 5
if int(category) == 5:
@@ -92,13 +91,15 @@ def main():
LLM_JUDGE[category].append(label)
# Store the results
RESULTS[index].append({
"question": question,
"gt_answer": gold_answer,
"response": generated_answer,
"category": category,
"llm_label": label
})
RESULTS[index].append(
{
"question": question,
"gt_answer": gold_answer,
"response": generated_answer,
"category": category,
"llm_label": label,
}
)
# Save intermediate results
with open(output_path, "w") as f:
@@ -108,8 +109,7 @@ def main():
print("All categories accuracy:")
for cat, results in LLM_JUDGE.items():
if results: # Only print if there are results for this category
print(f" Category {cat}: {np.mean(results):.4f} "
f"({sum(results)}/{len(results)})")
print(f" Category {cat}: {np.mean(results):.4f} " f"({sum(results)}/{len(results)})")
print("------------------------------------------")
index += 1

View File

@@ -3,7 +3,7 @@ Borrowed from https://github.com/WujiangXu/AgenticMemory/blob/main/utils.py
@article{xu2025mem,
title={A-mem: Agentic memory for llm agents},
author={Xu, Wujiang and Liang, Zujie and Mei, Kai and Gao, Hang and Tan, Juntao
and Zhang, Yongfeng},
journal={arXiv preprint arXiv:2502.12110},
year={2025}
@@ -26,42 +26,45 @@ from sentence_transformers.util import pytorch_cos_sim
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
except Exception as e:
print(f"Error downloading NLTK data: {e}")
# Initialize SentenceTransformer model (this will be reused)
try:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
except Exception as e:
print(f"Warning: Could not load SentenceTransformer model: {e}")
sentence_model = None
def simple_tokenize(text):
"""Simple tokenization function."""
# Convert to string if not already
text = str(text)
return text.lower().replace('.', ' ').replace(',', ' ').replace('!', ' ').replace('?', ' ').split()
return text.lower().replace(".", " ").replace(",", " ").replace("!", " ").replace("?", " ").split()
def calculate_rouge_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate ROUGE scores for prediction against reference."""
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(reference, prediction)
return {
'rouge1_f': scores['rouge1'].fmeasure,
'rouge2_f': scores['rouge2'].fmeasure,
'rougeL_f': scores['rougeL'].fmeasure
"rouge1_f": scores["rouge1"].fmeasure,
"rouge2_f": scores["rouge2"].fmeasure,
"rougeL_f": scores["rougeL"].fmeasure,
}
def calculate_bleu_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate BLEU scores with different n-gram settings."""
pred_tokens = nltk.word_tokenize(prediction.lower())
ref_tokens = [nltk.word_tokenize(reference.lower())]
weights_list = [(1, 0, 0, 0), (0.5, 0.5, 0, 0), (0.33, 0.33, 0.33, 0), (0.25, 0.25, 0.25, 0.25)]
smooth = SmoothingFunction().method1
scores = {}
for n, weights in enumerate(weights_list, start=1):
try:
@@ -69,26 +72,20 @@ def calculate_bleu_scores(prediction: str, reference: str) -> Dict[str, float]:
except Exception as e:
print(f"Error calculating BLEU score: {e}")
score = 0.0
scores[f'bleu{n}'] = score
scores[f"bleu{n}"] = score
return scores
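The body of the loop that actually computes each BLEU score falls outside the hunk above. Given that `pred_tokens`, `ref_tokens`, `weights_list`, and `SmoothingFunction().method1` are all prepared for NLTK's `sentence_bleu`, a hedged reconstruction of the whole function might look like this (the `sentence_bleu` call itself is an assumption, since it is elided from the diff):

# Hedged reconstruction, not the repository's code: the sentence_bleu call is assumed
# from the surrounding setup visible in the diff.
import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

def calculate_bleu_scores_sketch(prediction: str, reference: str) -> dict:
    pred_tokens = nltk.word_tokenize(prediction.lower())
    ref_tokens = [nltk.word_tokenize(reference.lower())]
    weights_list = [(1, 0, 0, 0), (0.5, 0.5, 0, 0), (0.33, 0.33, 0.33, 0), (0.25, 0.25, 0.25, 0.25)]
    smooth = SmoothingFunction().method1
    scores = {}
    for n, weights in enumerate(weights_list, start=1):
        try:
            score = sentence_bleu(ref_tokens, pred_tokens, weights=weights, smoothing_function=smooth)
        except Exception as e:
            print(f"Error calculating BLEU score: {e}")
            score = 0.0
        scores[f"bleu{n}"] = score
    return scores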
def calculate_bert_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate BERTScore for semantic similarity."""
try:
P, R, F1 = bert_score([prediction], [reference], lang='en', verbose=False)
return {
'bert_precision': P.item(),
'bert_recall': R.item(),
'bert_f1': F1.item()
}
P, R, F1 = bert_score([prediction], [reference], lang="en", verbose=False)
return {"bert_precision": P.item(), "bert_recall": R.item(), "bert_f1": F1.item()}
except Exception as e:
print(f"Error calculating BERTScore: {e}")
return {
'bert_precision': 0.0,
'bert_recall': 0.0,
'bert_f1': 0.0
}
return {"bert_precision": 0.0, "bert_recall": 0.0, "bert_f1": 0.0}
def calculate_meteor_score(prediction: str, reference: str) -> float:
"""Calculate METEOR score for the prediction."""
@@ -98,6 +95,7 @@ def calculate_meteor_score(prediction: str, reference: str) -> float:
print(f"Error calculating METEOR score: {e}")
return 0.0
def calculate_sentence_similarity(prediction: str, reference: str) -> float:
"""Calculate sentence embedding similarity using SentenceBERT."""
if sentence_model is None:
@@ -106,7 +104,7 @@ def calculate_sentence_similarity(prediction: str, reference: str) -> float:
# Encode sentences
embedding1 = sentence_model.encode([prediction], convert_to_tensor=True)
embedding2 = sentence_model.encode([reference], convert_to_tensor=True)
# Calculate cosine similarity
similarity = pytorch_cos_sim(embedding1, embedding2).item()
return float(similarity)
@@ -114,6 +112,7 @@ def calculate_sentence_similarity(prediction: str, reference: str) -> float:
print(f"Error calculating sentence similarity: {e}")
return 0.0
def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate comprehensive evaluation metrics for a prediction."""
# Handle empty or None values
@@ -130,31 +129,31 @@ def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:
"bleu4": 0.0,
"bert_f1": 0.0,
"meteor": 0.0,
"sbert_similarity": 0.0
"sbert_similarity": 0.0,
}
# Convert to strings if they're not already
prediction = str(prediction).strip()
reference = str(reference).strip()
# Calculate exact match
exact_match = int(prediction.lower() == reference.lower())
# Calculate token-based F1 score
pred_tokens = set(simple_tokenize(prediction))
ref_tokens = set(simple_tokenize(reference))
common_tokens = pred_tokens & ref_tokens
if not pred_tokens or not ref_tokens:
f1 = 0.0
else:
precision = len(common_tokens) / len(pred_tokens)
recall = len(common_tokens) / len(ref_tokens)
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
# Calculate all scores
bleu_scores = calculate_bleu_scores(prediction, reference)
# Combine all metrics
metrics = {
"exact_match": exact_match,
@@ -164,48 +163,49 @@ def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:
return metrics
def aggregate_metrics(all_metrics: List[Dict[str, float]], all_categories: List[int]) -> Dict[str, Dict[str, Union[float, Dict[str, float]]]]:
def aggregate_metrics(
all_metrics: List[Dict[str, float]], all_categories: List[int]
) -> Dict[str, Dict[str, Union[float, Dict[str, float]]]]:
"""Calculate aggregate statistics for all metrics, split by category."""
if not all_metrics:
return {}
# Initialize aggregates for overall and per-category metrics
aggregates = defaultdict(list)
category_aggregates = defaultdict(lambda: defaultdict(list))
# Collect all values for each metric, both overall and per category
for metrics, category in zip(all_metrics, all_categories):
for metric_name, value in metrics.items():
aggregates[metric_name].append(value)
category_aggregates[category][metric_name].append(value)
# Calculate statistics for overall metrics
results = {
"overall": {}
}
results = {"overall": {}}
for metric_name, values in aggregates.items():
results["overall"][metric_name] = {
'mean': statistics.mean(values),
'std': statistics.stdev(values) if len(values) > 1 else 0.0,
'median': statistics.median(values),
'min': min(values),
'max': max(values),
'count': len(values)
"mean": statistics.mean(values),
"std": statistics.stdev(values) if len(values) > 1 else 0.0,
"median": statistics.median(values),
"min": min(values),
"max": max(values),
"count": len(values),
}
# Calculate statistics for each category
for category in sorted(category_aggregates.keys()):
results[f"category_{category}"] = {}
for metric_name, values in category_aggregates[category].items():
if values: # Only calculate if we have values for this category
results[f"category_{category}"][metric_name] = {
'mean': statistics.mean(values),
'std': statistics.stdev(values) if len(values) > 1 else 0.0,
'median': statistics.median(values),
'min': min(values),
'max': max(values),
'count': len(values)
"mean": statistics.mean(values),
"std": statistics.stdev(values) if len(values) > 1 else 0.0,
"median": statistics.median(values),
"min": min(values),
"max": max(values),
"count": len(values),
}
return results
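Taken together, `calculate_metrics` produces one flat score dict per prediction and `aggregate_metrics` summarizes a list of them overall and per category. A hedged usage sketch with invented inputs (running it requires the NLTK data and embedding models loaded at the top of the file):

# Hypothetical predictions, references, and categories for illustration only; the
# "overall"/"category_<n>" layout and per-metric stats follow the functions above.
predictions = ["Paris", "around 7 pm", "blue"]
references = ["Paris", "7 pm", "green"]
categories = [1, 2, 1]

all_metrics = [calculate_metrics(p, r) for p, r in zip(predictions, references)]
summary = aggregate_metrics(all_metrics, categories)

print(summary["overall"]["f1"]["mean"])      # mean token-level F1 across all items
print(summary["category_1"]["exact_match"])  # stats dict: mean/std/median/min/max/count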

View File

@@ -144,4 +144,4 @@ ANSWER_PROMPT_ZEP = """
Question: {{question}}
Answer:
"""

View File

@@ -21,23 +21,15 @@ class Experiment:
def main():
parser = argparse.ArgumentParser(description='Run memory experiments')
parser.add_argument('--technique_type', choices=TECHNIQUES, default='mem0',
help='Memory technique to use')
parser.add_argument('--method', choices=METHODS, default='add',
help='Method to use')
parser.add_argument('--chunk_size', type=int, default=1000,
help='Chunk size for processing')
parser.add_argument('--output_folder', type=str, default='results/',
help='Output path for results')
parser.add_argument('--top_k', type=int, default=30,
help='Number of top memories to retrieve')
parser.add_argument('--filter_memories', action='store_true', default=False,
help='Whether to filter memories')
parser.add_argument('--is_graph', action='store_true', default=False,
help='Whether to use graph-based search')
parser.add_argument('--num_chunks', type=int, default=1,
help='Number of chunks to process')
parser = argparse.ArgumentParser(description="Run memory experiments")
parser.add_argument("--technique_type", choices=TECHNIQUES, default="mem0", help="Memory technique to use")
parser.add_argument("--method", choices=METHODS, default="add", help="Method to use")
parser.add_argument("--chunk_size", type=int, default=1000, help="Chunk size for processing")
parser.add_argument("--output_folder", type=str, default="results/", help="Output path for results")
parser.add_argument("--top_k", type=int, default=30, help="Number of top memories to retrieve")
parser.add_argument("--filter_memories", action="store_true", default=False, help="Whether to filter memories")
parser.add_argument("--is_graph", action="store_true", default=False, help="Whether to use graph-based search")
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks to process")
args = parser.parse_args()
@@ -46,33 +38,18 @@ def main():
if args.technique_type == "mem0":
if args.method == "add":
memory_manager = MemoryADD(
data_path='dataset/locomo10.json',
is_graph=args.is_graph
)
memory_manager = MemoryADD(data_path="dataset/locomo10.json", is_graph=args.is_graph)
memory_manager.process_all_conversations()
elif args.method == "search":
output_file_path = os.path.join(
args.output_folder,
f"mem0_results_top_{args.top_k}_filter_{args.filter_memories}_graph_{args.is_graph}.json"
f"mem0_results_top_{args.top_k}_filter_{args.filter_memories}_graph_{args.is_graph}.json",
)
memory_searcher = MemorySearch(
output_file_path,
args.top_k,
args.filter_memories,
args.is_graph
)
memory_searcher.process_data_file('dataset/locomo10.json')
memory_searcher = MemorySearch(output_file_path, args.top_k, args.filter_memories, args.is_graph)
memory_searcher.process_data_file("dataset/locomo10.json")
elif args.technique_type == "rag":
output_file_path = os.path.join(
args.output_folder,
f"rag_results_{args.chunk_size}_k{args.num_chunks}.json"
)
rag_manager = RAGManager(
data_path="dataset/locomo10_rag.json",
chunk_size=args.chunk_size,
k=args.num_chunks
)
output_file_path = os.path.join(args.output_folder, f"rag_results_{args.chunk_size}_k{args.num_chunks}.json")
rag_manager = RAGManager(data_path="dataset/locomo10_rag.json", chunk_size=args.chunk_size, k=args.num_chunks)
rag_manager.process_all_conversations(output_file_path)
elif args.technique_type == "langmem":
output_file_path = os.path.join(args.output_folder, "langmem_results.json")
@@ -85,11 +62,7 @@ def main():
elif args.method == "search":
output_file_path = os.path.join(args.output_folder, "zep_search_results.json")
zep_manager = ZepSearch()
zep_manager.process_data_file(
"dataset/locomo10.json",
"1",
output_file_path
)
zep_manager.process_data_file("dataset/locomo10.json", "1", output_file_path)
elif args.technique_type == "openai":
output_file_path = os.path.join(args.output_folder, "openai_results.json")
openai_manager = OpenAIPredict()

View File

@@ -28,14 +28,12 @@ def get_answer(question, speaker_1_user_id, speaker_1_memories, speaker_2_user_i
speaker_1_user_id=speaker_1_user_id,
speaker_1_memories=speaker_1_memories,
speaker_2_user_id=speaker_2_user_id,
speaker_2_memories=speaker_2_memories
speaker_2_memories=speaker_2_memories,
)
t1 = time.time()
response = client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[{"role": "system", "content": prompt}],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": prompt}], temperature=0.0
)
t2 = time.time()
return response.choices[0].message.content, t2 - t1
@@ -59,7 +57,9 @@ def prompt(state):
class LangMem:
def __init__(self,):
def __init__(
self,
):
self.store = InMemoryStore(
index={
"dims": 1536,
@@ -80,18 +80,12 @@ class LangMem:
)
def add_memory(self, message, config):
return self.agent.invoke(
{"messages": [{"role": "user", "content": message}]},
config=config
)
return self.agent.invoke({"messages": [{"role": "user", "content": message}]}, config=config)
def search_memory(self, query, config):
try:
t1 = time.time()
response = self.agent.invoke(
{"messages": [{"role": "user", "content": query}]},
config=config
)
response = self.agent.invoke({"messages": [{"role": "user", "content": query}]}, config=config)
t2 = time.time()
return response["messages"][-1].content, t2 - t1
except Exception as e:
@@ -102,7 +96,7 @@ class LangMem:
class LangMemManager:
def __init__(self, dataset_path):
self.dataset_path = dataset_path
with open(self.dataset_path, 'r') as f:
with open(self.dataset_path, "r") as f:
self.data = json.load(f)
def process_all_conversations(self, output_file_path):
@@ -123,7 +117,7 @@ class LangMemManager:
# Identify speakers
for conv in chat_history:
speakers.add(conv['speaker'])
speakers.add(conv["speaker"])
if len(speakers) != 2:
raise ValueError(f"Expected 2 speakers, got {len(speakers)}")
@@ -134,50 +128,52 @@ class LangMemManager:
# Add memories for each message
for conv in tqdm(chat_history, desc=f"Processing messages {key}", leave=False):
message = f"{conv['timestamp']} | {conv['speaker']}: {conv['text']}"
if conv['speaker'] == speaker1:
if conv["speaker"] == speaker1:
agent1.add_memory(message, config)
elif conv['speaker'] == speaker2:
elif conv["speaker"] == speaker2:
agent2.add_memory(message, config)
else:
raise ValueError(f"Expected speaker1 or speaker2, got {conv['speaker']}")
# Process questions
for q in tqdm(questions, desc=f"Processing questions {key}", leave=False):
category = q['category']
category = q["category"]
if int(category) == 5:
continue
answer = q['answer']
question = q['question']
answer = q["answer"]
question = q["question"]
response1, speaker1_memory_time = agent1.search_memory(question, config)
response2, speaker2_memory_time = agent2.search_memory(question, config)
generated_answer, response_time = get_answer(
question, speaker1, response1, speaker2, response2
)
generated_answer, response_time = get_answer(question, speaker1, response1, speaker2, response2)
result[key].append({
"question": question,
"answer": answer,
"response1": response1,
"response2": response2,
"category": category,
"speaker1_memory_time": speaker1_memory_time,
"speaker2_memory_time": speaker2_memory_time,
"response_time": response_time,
'response': generated_answer
})
result[key].append(
{
"question": question,
"answer": answer,
"response1": response1,
"response2": response2,
"category": category,
"speaker1_memory_time": speaker1_memory_time,
"speaker2_memory_time": speaker2_memory_time,
"response_time": response_time,
"response": generated_answer,
}
)
return result
# Use multiprocessing to process conversations in parallel
with mp.Pool(processes=10) as pool:
results = list(tqdm(
pool.imap(process_conversation, list(self.data.items())),
total=len(self.data),
desc="Processing conversations"
))
results = list(
tqdm(
pool.imap(process_conversation, list(self.data.items())),
total=len(self.data),
desc="Processing conversations",
)
)
# Combine results from all workers
for result in results:
@@ -185,5 +181,5 @@ class LangMemManager:
OUTPUT[key].extend(items)
# Save final results
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(OUTPUT, f, indent=4)

View File

@@ -13,7 +13,7 @@ load_dotenv()
# Update custom instructions
custom_instructions ="""
custom_instructions = """
Generate personal memories that follow these guidelines:
1. Each memory should be self-contained with complete context, including:
@@ -47,7 +47,7 @@ class MemoryADD:
self.mem0_client = MemoryClient(
api_key=os.getenv("MEM0_API_KEY"),
org_id=os.getenv("MEM0_ORGANIZATION_ID"),
project_id=os.getenv("MEM0_PROJECT_ID")
project_id=os.getenv("MEM0_PROJECT_ID"),
)
self.mem0_client.update_project(custom_instructions=custom_instructions)
@@ -59,15 +59,16 @@ class MemoryADD:
self.load_data()
def load_data(self):
with open(self.data_path, 'r') as f:
with open(self.data_path, "r") as f:
self.data = json.load(f)
return self.data
def add_memory(self, user_id, message, metadata, retries=3):
for attempt in range(retries):
try:
_ = self.mem0_client.add(message, user_id=user_id, version="v2",
metadata=metadata, enable_graph=self.is_graph)
_ = self.mem0_client.add(
message, user_id=user_id, version="v2", metadata=metadata, enable_graph=self.is_graph
)
return
except Exception as e:
if attempt < retries - 1:
@@ -78,13 +79,13 @@ class MemoryADD:
def add_memories_for_speaker(self, speaker, messages, timestamp, desc):
for i in tqdm(range(0, len(messages), self.batch_size), desc=desc):
batch_messages = messages[i:i+self.batch_size]
batch_messages = messages[i : i + self.batch_size]
self.add_memory(speaker, batch_messages, metadata={"timestamp": timestamp})
def process_conversation(self, item, idx):
conversation = item['conversation']
speaker_a = conversation['speaker_a']
speaker_b = conversation['speaker_b']
conversation = item["conversation"]
speaker_a = conversation["speaker_a"]
speaker_b = conversation["speaker_b"]
speaker_a_user_id = f"{speaker_a}_{idx}"
speaker_b_user_id = f"{speaker_b}_{idx}"
@@ -94,7 +95,7 @@ class MemoryADD:
self.mem0_client.delete_all(user_id=speaker_b_user_id)
for key in conversation.keys():
if key in ['speaker_a', 'speaker_b'] or "date" in key or "timestamp" in key:
if key in ["speaker_a", "speaker_b"] or "date" in key or "timestamp" in key:
continue
date_time_key = key + "_date_time"
@@ -104,10 +105,10 @@ class MemoryADD:
messages = []
messages_reverse = []
for chat in chats:
if chat['speaker'] == speaker_a:
if chat["speaker"] == speaker_a:
messages.append({"role": "user", "content": f"{speaker_a}: {chat['text']}"})
messages_reverse.append({"role": "assistant", "content": f"{speaker_a}: {chat['text']}"})
elif chat['speaker'] == speaker_b:
elif chat["speaker"] == speaker_b:
messages.append({"role": "assistant", "content": f"{speaker_b}: {chat['text']}"})
messages_reverse.append({"role": "user", "content": f"{speaker_b}: {chat['text']}"})
else:
@@ -116,11 +117,11 @@ class MemoryADD:
# add memories for the two users on different threads
thread_a = threading.Thread(
target=self.add_memories_for_speaker,
args=(speaker_a_user_id, messages, timestamp, "Adding Memories for Speaker A")
args=(speaker_a_user_id, messages, timestamp, "Adding Memories for Speaker A"),
)
thread_b = threading.Thread(
target=self.add_memories_for_speaker,
args=(speaker_b_user_id, messages_reverse, timestamp, "Adding Memories for Speaker B")
args=(speaker_b_user_id, messages_reverse, timestamp, "Adding Memories for Speaker B"),
)
thread_a.start()
@@ -134,10 +135,7 @@ class MemoryADD:
if not self.data:
raise ValueError("No data loaded. Please set data_path and call load_data() first.")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(self.process_conversation, item, idx)
for idx, item in enumerate(self.data)
]
futures = [executor.submit(self.process_conversation, item, idx) for idx, item in enumerate(self.data)]
for future in futures:
future.result()

View File

@@ -16,12 +16,11 @@ load_dotenv()
class MemorySearch:
def __init__(self, output_path='results.json', top_k=10, filter_memories=False, is_graph=False):
def __init__(self, output_path="results.json", top_k=10, filter_memories=False, is_graph=False):
self.mem0_client = MemoryClient(
api_key=os.getenv("MEM0_API_KEY"),
org_id=os.getenv("MEM0_ORGANIZATION_ID"),
project_id=os.getenv("MEM0_PROJECT_ID")
project_id=os.getenv("MEM0_PROJECT_ID"),
)
self.top_k = top_k
self.openai_client = OpenAI()
@@ -42,11 +41,18 @@ class MemorySearch:
try:
if self.is_graph:
print("Searching with graph")
memories = self.mem0_client.search(query, user_id=user_id, top_k=self.top_k,
filter_memories=self.filter_memories, enable_graph=True, output_format='v1.1')
memories = self.mem0_client.search(
query,
user_id=user_id,
top_k=self.top_k,
filter_memories=self.filter_memories,
enable_graph=True,
output_format="v1.1",
)
else:
memories = self.mem0_client.search(query, user_id=user_id, top_k=self.top_k,
filter_memories=self.filter_memories)
memories = self.mem0_client.search(
query, user_id=user_id, top_k=self.top_k, filter_memories=self.filter_memories
)
break
except Exception as e:
print("Retrying...")
@@ -57,64 +63,86 @@ class MemorySearch:
end_time = time.time()
if not self.is_graph:
semantic_memories = [{'memory': memory['memory'],
'timestamp': memory['metadata']['timestamp'],
'score': round(memory['score'], 2)}
for memory in memories]
semantic_memories = [
{
"memory": memory["memory"],
"timestamp": memory["metadata"]["timestamp"],
"score": round(memory["score"], 2),
}
for memory in memories
]
graph_memories = None
else:
semantic_memories = [{'memory': memory['memory'],
'timestamp': memory['metadata']['timestamp'],
'score': round(memory['score'], 2)} for memory in memories['results']]
graph_memories = [{"source": relation['source'], "relationship": relation['relationship'], "target": relation['target']} for relation in memories['relations']]
semantic_memories = [
{
"memory": memory["memory"],
"timestamp": memory["metadata"]["timestamp"],
"score": round(memory["score"], 2),
}
for memory in memories["results"]
]
graph_memories = [
{"source": relation["source"], "relationship": relation["relationship"], "target": relation["target"]}
for relation in memories["relations"]
]
return semantic_memories, graph_memories, end_time - start_time
def answer_question(self, speaker_1_user_id, speaker_2_user_id, question, answer, category):
speaker_1_memories, speaker_1_graph_memories, speaker_1_memory_time = self.search_memory(speaker_1_user_id, question)
speaker_2_memories, speaker_2_graph_memories, speaker_2_memory_time = self.search_memory(speaker_2_user_id, question)
speaker_1_memories, speaker_1_graph_memories, speaker_1_memory_time = self.search_memory(
speaker_1_user_id, question
)
speaker_2_memories, speaker_2_graph_memories, speaker_2_memory_time = self.search_memory(
speaker_2_user_id, question
)
search_1_memory = [f"{item['timestamp']}: {item['memory']}"
for item in speaker_1_memories]
search_2_memory = [f"{item['timestamp']}: {item['memory']}"
for item in speaker_2_memories]
search_1_memory = [f"{item['timestamp']}: {item['memory']}" for item in speaker_1_memories]
search_2_memory = [f"{item['timestamp']}: {item['memory']}" for item in speaker_2_memories]
template = Template(self.ANSWER_PROMPT)
answer_prompt = template.render(
speaker_1_user_id=speaker_1_user_id.split('_')[0],
speaker_2_user_id=speaker_2_user_id.split('_')[0],
speaker_1_user_id=speaker_1_user_id.split("_")[0],
speaker_2_user_id=speaker_2_user_id.split("_")[0],
speaker_1_memories=json.dumps(search_1_memory, indent=4),
speaker_2_memories=json.dumps(search_2_memory, indent=4),
speaker_1_graph_memories=json.dumps(speaker_1_graph_memories, indent=4),
speaker_2_graph_memories=json.dumps(speaker_2_graph_memories, indent=4),
question=question
question=question,
)
t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, speaker_1_memories, speaker_2_memories, speaker_1_memory_time, speaker_2_memory_time, speaker_1_graph_memories, speaker_2_graph_memories, response_time
return (
response.choices[0].message.content,
speaker_1_memories,
speaker_2_memories,
speaker_1_memory_time,
speaker_2_memory_time,
speaker_1_graph_memories,
speaker_2_graph_memories,
response_time,
)
def process_question(self, val, speaker_a_user_id, speaker_b_user_id):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")
response, speaker_1_memories, speaker_2_memories, speaker_1_memory_time, speaker_2_memory_time, speaker_1_graph_memories, speaker_2_graph_memories, response_time = self.answer_question(
speaker_a_user_id,
speaker_b_user_id,
question,
answer,
category
)
(
response,
speaker_1_memories,
speaker_2_memories,
speaker_1_memory_time,
speaker_2_memory_time,
speaker_1_graph_memories,
speaker_2_graph_memories,
response_time,
) = self.answer_question(speaker_a_user_id, speaker_b_user_id, question, answer, category)
result = {
"question": question,
@@ -125,67 +153,63 @@ class MemorySearch:
"adversarial_answer": adversarial_answer,
"speaker_1_memories": speaker_1_memories,
"speaker_2_memories": speaker_2_memories,
'num_speaker_1_memories': len(speaker_1_memories),
'num_speaker_2_memories': len(speaker_2_memories),
'speaker_1_memory_time': speaker_1_memory_time,
'speaker_2_memory_time': speaker_2_memory_time,
"num_speaker_1_memories": len(speaker_1_memories),
"num_speaker_2_memories": len(speaker_2_memories),
"speaker_1_memory_time": speaker_1_memory_time,
"speaker_2_memory_time": speaker_2_memory_time,
"speaker_1_graph_memories": speaker_1_graph_memories,
"speaker_2_graph_memories": speaker_2_graph_memories,
"response_time": response_time
"response_time": response_time,
}
# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
return result
def process_data_file(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)
for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
conversation = item['conversation']
speaker_a = conversation['speaker_a']
speaker_b = conversation['speaker_b']
qa = item["qa"]
conversation = item["conversation"]
speaker_a = conversation["speaker_a"]
speaker_b = conversation["speaker_b"]
speaker_a_user_id = f"{speaker_a}_{idx}"
speaker_b_user_id = f"{speaker_b}_{idx}"
for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
question_item,
speaker_a_user_id,
speaker_b_user_id
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(question_item, speaker_a_user_id, speaker_b_user_id)
self.results[idx].append(result)
# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
# Final save at the end
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
def process_questions_parallel(self, qa_list, speaker_a_user_id, speaker_b_user_id, max_workers=1):
def process_single_question(val):
result = self.process_question(val, speaker_a_user_id, speaker_b_user_id)
# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
return result
with ThreadPoolExecutor(max_workers=max_workers) as executor:
results = list(tqdm(
executor.map(process_single_question, qa_list),
total=len(qa_list),
desc="Answering Questions"
))
results = list(
tqdm(executor.map(process_single_question, qa_list), total=len(qa_list), desc="Answering Questions")
)
# Final save at the end
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
return results

View File

@@ -59,23 +59,19 @@ class OpenAIPredict:
self.results = defaultdict(list)
def search_memory(self, idx):
with open(f'memories/{idx}.txt', 'r') as file:
with open(f"memories/{idx}.txt", "r") as file:
memories = file.read()
return memories, 0
def process_question(self, val, idx):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")
response, search_memory_time, response_time, context = self.answer_question(
idx,
question
)
response, search_memory_time, response_time, context = self.answer_question(idx, question)
result = {
"question": question,
@@ -86,7 +82,7 @@ class OpenAIPredict:
"adversarial_answer": adversarial_answer,
"search_memory_time": search_memory_time,
"response_time": response_time,
"context": context
"context": context,
}
return result
@@ -95,43 +91,35 @@ class OpenAIPredict:
memories, search_memory_time = self.search_memory(idx)
template = Template(ANSWER_PROMPT)
answer_prompt = template.render(
memories=memories,
question=question
)
answer_prompt = template.render(memories=memories, question=question)
t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, search_memory_time, response_time, memories
def process_data_file(self, file_path, output_file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)
for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
qa = item["qa"]
for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
question_item,
idx
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(question_item, idx)
self.results[idx].append(result)
# Save results after each question is processed
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)
# Final save at the end
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)
@@ -141,4 +129,3 @@ if __name__ == "__main__":
args = parser.parse_args()
openai_predict = OpenAIPredict()
openai_predict.process_data_file("../../dataset/locomo10.json", args.output_file_path)

View File

@@ -33,10 +33,7 @@ class RAGManager:
def generate_response(self, question, context):
template = Template(PROMPT)
prompt = template.render(
CONTEXT=context,
QUESTION=question
)
prompt = template.render(CONTEXT=context, QUESTION=question)
max_retries = 3
retries = 0
@@ -47,19 +44,21 @@ class RAGManager:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system",
"content": "You are a helpful assistant that can answer "
"questions based on the provided context."
"If the question involves timing, use the conversation date for reference."
"Provide the shortest possible answer."
"Use words directly from the conversation when possible."
"Avoid using subjects in your answer."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "You are a helpful assistant that can answer "
"questions based on the provided context."
"If the question involves timing, use the conversation date for reference."
"Provide the shortest possible answer."
"Use words directly from the conversation when possible."
"Avoid using subjects in your answer.",
},
{"role": "user", "content": prompt},
],
temperature=0
temperature=0,
)
t2 = time.time()
return response.choices[0].message.content.strip(), t2-t1
return response.choices[0].message.content.strip(), t2 - t1
except Exception as e:
retries += 1
if retries > max_retries:
@@ -69,21 +68,16 @@ class RAGManager:
def clean_chat_history(self, chat_history):
cleaned_chat_history = ""
for c in chat_history:
cleaned_chat_history += (f"{c['timestamp']} | {c['speaker']}: "
f"{c['text']}\n")
cleaned_chat_history += f"{c['timestamp']} | {c['speaker']}: " f"{c['text']}\n"
return cleaned_chat_history
def calculate_embedding(self, document):
response = self.client.embeddings.create(
model=os.getenv("EMBEDDING_MODEL"),
input=document
)
response = self.client.embeddings.create(model=os.getenv("EMBEDDING_MODEL"), input=document)
return response.data[0].embedding
def calculate_similarity(self, embedding1, embedding2):
return np.dot(embedding1, embedding2) / (
np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
def search(self, query, chunks, embeddings, k=1):
"""
@@ -101,10 +95,7 @@ class RAGManager:
"""
t1 = time.time()
query_embedding = self.calculate_embedding(query)
similarities = [
self.calculate_similarity(query_embedding, embedding)
for embedding in embeddings
]
similarities = [self.calculate_similarity(query_embedding, embedding) for embedding in embeddings]
# Get indices of top-k most similar chunks
if k == 1:
@@ -118,7 +109,7 @@ class RAGManager:
combined_chunks = "\n<->\n".join([chunks[i] for i in top_indices])
t2 = time.time()
return combined_chunks, t2-t1
return combined_chunks, t2 - t1
def create_chunks(self, chat_history, chunk_size=500):
"""
@@ -139,7 +130,7 @@ class RAGManager:
# Split into chunks based on token count
for i in range(0, len(tokens), chunk_size):
chunk_tokens = tokens[i:i+chunk_size]
chunk_tokens = tokens[i : i + chunk_size]
chunk = encoding.decode(chunk_tokens)
chunks.append(chunk)
@@ -159,13 +150,9 @@ class RAGManager:
chat_history = value["conversation"]
questions = value["question"]
chunks, embeddings = self.create_chunks(
chat_history, self.chunk_size
)
chunks, embeddings = self.create_chunks(chat_history, self.chunk_size)
for item in tqdm(
questions, desc="Answering questions", leave=False
):
for item in tqdm(questions, desc="Answering questions", leave=False):
question = item["question"]
answer = item.get("answer", "")
category = item["category"]
@@ -174,22 +161,20 @@ class RAGManager:
context = chunks[0]
search_time = 0
else:
context, search_time = self.search(
question, chunks, embeddings, k=self.k
)
response, response_time = self.generate_response(
question, context
)
context, search_time = self.search(question, chunks, embeddings, k=self.k)
response, response_time = self.generate_response(question, context)
FINAL_RESULTS[key].append({
"question": question,
"answer": answer,
"category": category,
"context": context,
"response": response,
"search_time": search_time,
"response_time": response_time,
})
FINAL_RESULTS[key].append(
{
"question": question,
"answer": answer,
"category": category,
"context": context,
"response": response,
"search_time": search_time,
"response_time": response_time,
}
)
with open(output_file_path, "w+") as f:
json.dump(FINAL_RESULTS, f, indent=4)
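The retrieval path embeds every chunk once, embeds the query, ranks chunks by cosine similarity, and joins the top-k with a "\n<->\n" separator; the index-selection lines are elided by the hunks above. A hedged sketch of that ranking step (the argsort-based selection is an assumption about the elided code):

# Hedged sketch of cosine-similarity top-k selection; the argsort step is assumed, since
# the original index-picking code falls outside the diff context shown.
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def top_k_chunks(query_embedding, chunk_embeddings, chunks, k=1):
    similarities = [cosine_similarity(query_embedding, e) for e in chunk_embeddings]
    top_indices = np.argsort(similarities)[-k:][::-1]  # best match first
    return "\n<->\n".join(chunks[i] for i in top_indices)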

View File

@@ -1,12 +1,3 @@
TECHNIQUES = [
"mem0",
"rag",
"langmem",
"zep",
"openai"
]
TECHNIQUES = ["mem0", "rag", "langmem", "zep", "openai"]
METHODS = [
"add",
"search"
]
METHODS = ["add", "search"]

View File

@@ -19,12 +19,12 @@ class ZepAdd:
self.load_data()
def load_data(self):
with open(self.data_path, 'r') as f:
with open(self.data_path, "r") as f:
self.data = json.load(f)
return self.data
def process_conversation(self, run_id, item, idx):
conversation = item['conversation']
conversation = item["conversation"]
user_id = f"run_id_{run_id}_experiment_user_{idx}"
session_id = f"run_id_{run_id}_experiment_session_{idx}"
@@ -41,7 +41,7 @@ class ZepAdd:
print("Starting to add memories... for user", user_id)
for key in tqdm(conversation.keys(), desc=f"Processing user {user_id}"):
if key in ['speaker_a', 'speaker_b'] or "date" in key:
if key in ["speaker_a", "speaker_b"] or "date" in key:
continue
date_time_key = key + "_date_time"
@@ -51,11 +51,13 @@ class ZepAdd:
for chat in tqdm(chats, desc=f"Adding chats for {key}", leave=False):
self.zep_client.memory.add(
session_id=session_id,
messages=[Message(
role=chat['speaker'],
role_type="user",
content=f"{timestamp}: {chat['text']}",
)]
messages=[
Message(
role=chat["speaker"],
role_type="user",
content=f"{timestamp}: {chat['text']}",
)
],
)
def process_all_conversations(self, run_id):
@@ -71,4 +73,4 @@ if __name__ == "__main__":
parser.add_argument("--run_id", type=str, required=True)
args = parser.parse_args()
zep_add = ZepAdd(data_path="../../dataset/locomo10.json")
zep_add.process_all_conversations(args.run_id)

View File

@@ -42,9 +42,9 @@ class ZepSearch:
return f"{edge.valid_at if edge.valid_at else 'date unknown'} - {(edge.invalid_at if edge.invalid_at else 'present')}"
def compose_search_context(self, edges: list[EntityEdge], nodes: list[EntityNode]) -> str:
facts = [f' - {edge.fact} ({self.format_edge_date_range(edge)})' for edge in edges]
entities = [f' - {node.name}: {node.summary}' for node in nodes]
return TEMPLATE.format(facts='\n'.join(facts), entities='\n'.join(entities))
facts = [f" - {edge.fact} ({self.format_edge_date_range(edge)})" for edge in edges]
entities = [f" - {node.name}: {node.summary}" for node in nodes]
return TEMPLATE.format(facts="\n".join(facts), entities="\n".join(entities))
def search_memory(self, run_id, idx, query, max_retries=3, retry_delay=1):
start_time = time.time()
@@ -52,8 +52,14 @@ class ZepSearch:
while retries < max_retries:
try:
user_id = f"run_id_{run_id}_experiment_user_{idx}"
edges_results = (self.zep_client.graph.search(user_id=user_id, reranker='cross_encoder', query=query, scope='edges', limit=20)).edges
node_results = (self.zep_client.graph.search(user_id=user_id, reranker='rrf', query=query, scope='nodes', limit=20)).nodes
edges_results = (
self.zep_client.graph.search(
user_id=user_id, reranker="cross_encoder", query=query, scope="edges", limit=20
)
).edges
node_results = (
self.zep_client.graph.search(user_id=user_id, reranker="rrf", query=query, scope="nodes", limit=20)
).nodes
context = self.compose_search_context(edges_results, node_results)
break
except Exception as e:
@@ -68,17 +74,13 @@ class ZepSearch:
return context, end_time - start_time
def process_question(self, run_id, val, idx):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")
response, search_memory_time, response_time, context = self.answer_question(
run_id,
idx,
question
)
response, search_memory_time, response_time, context = self.answer_question(run_id, idx, question)
result = {
"question": question,
@@ -89,7 +91,7 @@ class ZepSearch:
"adversarial_answer": adversarial_answer,
"search_memory_time": search_memory_time,
"response_time": response_time,
"context": context
"context": context,
}
return result
@@ -98,44 +100,35 @@ class ZepSearch:
context, search_memory_time = self.search_memory(run_id, idx, question)
template = Template(ANSWER_PROMPT_ZEP)
answer_prompt = template.render(
memories=context,
question=question
)
answer_prompt = template.render(memories=context, question=question)
t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, search_memory_time, response_time, context
def process_data_file(self, file_path, run_id, output_file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)
for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
qa = item["qa"]
for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
run_id,
question_item,
idx
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(run_id, question_item, idx)
self.results[idx].append(result)
# Save results after each question is processed
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)
# Final save at the end
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)