Formatting (#2750)
@@ -14,46 +14,47 @@ def process_item(item_data):
local_results = defaultdict(list)

for item in v:
gt_answer = str(item['answer'])
pred_answer = str(item['response'])
category = str(item['category'])
question = str(item['question'])
gt_answer = str(item["answer"])
pred_answer = str(item["response"])
category = str(item["category"])
question = str(item["question"])

# Skip category 5
if category == '5':
if category == "5":
continue

metrics = calculate_metrics(pred_answer, gt_answer)
bleu_scores = calculate_bleu_scores(pred_answer, gt_answer)
llm_score = evaluate_llm_judge(question, gt_answer, pred_answer)

local_results[k].append({
"question": question,
"answer": gt_answer,
"response": pred_answer,
"category": category,
"bleu_score": bleu_scores["bleu1"],
"f1_score": metrics["f1"],
"llm_score": llm_score
})
local_results[k].append(
{
"question": question,
"answer": gt_answer,
"response": pred_answer,
"category": category,
"bleu_score": bleu_scores["bleu1"],
"f1_score": metrics["f1"],
"llm_score": llm_score,
}
)

return local_results


def main():
parser = argparse.ArgumentParser(description='Evaluate RAG results')
parser.add_argument('--input_file', type=str,
default="results/rag_results_500_k1.json",
help='Path to the input dataset file')
parser.add_argument('--output_file', type=str,
default="evaluation_metrics.json",
help='Path to save the evaluation results')
parser.add_argument('--max_workers', type=int, default=10,
help='Maximum number of worker threads')
parser = argparse.ArgumentParser(description="Evaluate RAG results")
parser.add_argument(
"--input_file", type=str, default="results/rag_results_500_k1.json", help="Path to the input dataset file"
)
parser.add_argument(
"--output_file", type=str, default="evaluation_metrics.json", help="Path to save the evaluation results"
)
parser.add_argument("--max_workers", type=int, default=10, help="Maximum number of worker threads")

args = parser.parse_args()

with open(args.input_file, 'r') as f:
with open(args.input_file, "r") as f:
data = json.load(f)

results = defaultdict(list)
@@ -61,18 +62,16 @@ def main():

# Use ThreadPoolExecutor with specified workers
with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
futures = [executor.submit(process_item, item_data)
for item_data in data.items()]
futures = [executor.submit(process_item, item_data) for item_data in data.items()]

for future in tqdm(concurrent.futures.as_completed(futures),
total=len(futures)):
for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
local_results = future.result()
with results_lock:
for k, items in local_results.items():
results[k].extend(items)

# Save results to JSON file
with open(args.output_file, 'w') as f:
with open(args.output_file, "w") as f:
json.dump(results, f, indent=4)

print(f"Results saved to {args.output_file}")
@@ -3,7 +3,7 @@ import json
import pandas as pd

# Load the evaluation metrics data
with open('evaluation_metrics.json', 'r') as f:
with open("evaluation_metrics.json", "r") as f:
data = json.load(f)

# Flatten the data into a list of question items
@@ -15,28 +15,20 @@ for key in data:
df = pd.DataFrame(all_items)

# Convert category to numeric type
df['category'] = pd.to_numeric(df['category'])
df["category"] = pd.to_numeric(df["category"])

# Calculate mean scores by category
result = df.groupby('category').agg({
'bleu_score': 'mean',
'f1_score': 'mean',
'llm_score': 'mean'
}).round(4)
result = df.groupby("category").agg({"bleu_score": "mean", "f1_score": "mean", "llm_score": "mean"}).round(4)

# Add count of questions per category
result['count'] = df.groupby('category').size()
result["count"] = df.groupby("category").size()

# Print the results
print("Mean Scores Per Category:")
print(result)

# Calculate overall means
overall_means = df.agg({
'bleu_score': 'mean',
'f1_score': 'mean',
'llm_score': 'mean'
}).round(4)
overall_means = df.agg({"bleu_score": "mean", "f1_score": "mean", "llm_score": "mean"}).round(4)

print("\nOverall Mean Scores:")
print(overall_means)
print(overall_means)
@@ -33,35 +33,34 @@ Do NOT include both CORRECT and WRONG in your response, or it will break the eva
Just return the label CORRECT or WRONG in a json format with the key as "label".
"""


def evaluate_llm_judge(question, gold_answer, generated_answer):
"""Evaluate the generated answer against the gold answer using an LLM judge."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": ACCURACY_PROMPT.format(
question=question,
gold_answer=gold_answer,
generated_answer=generated_answer
)
}],
messages=[
{
"role": "user",
"content": ACCURACY_PROMPT.format(
question=question, gold_answer=gold_answer, generated_answer=generated_answer
),
}
],
response_format={"type": "json_object"},
temperature=0.0
temperature=0.0,
)
label = json.loads(response.choices[0].message.content)['label']
label = json.loads(response.choices[0].message.content)["label"]
return 1 if label == "CORRECT" else 0


def main():
"""Main function to evaluate RAG results using LLM judge."""
parser = argparse.ArgumentParser(
description='Evaluate RAG results using LLM judge'
)
parser = argparse.ArgumentParser(description="Evaluate RAG results using LLM judge")
parser.add_argument(
'--input_file',
"--input_file",
type=str,
default="results/default_run_v4_k30_new_graph.json",
help='Path to the input dataset file'
help="Path to the input dataset file",
)

args = parser.parse_args()
@@ -78,10 +77,10 @@ def main():
index = 0
for k, v in data.items():
for x in v:
question = x['question']
gold_answer = x['answer']
generated_answer = x['response']
category = x['category']
question = x["question"]
gold_answer = x["answer"]
generated_answer = x["response"]
category = x["category"]

# Skip category 5
if int(category) == 5:
@@ -92,13 +91,15 @@ def main():
LLM_JUDGE[category].append(label)

# Store the results
RESULTS[index].append({
"question": question,
"gt_answer": gold_answer,
"response": generated_answer,
"category": category,
"llm_label": label
})
RESULTS[index].append(
{
"question": question,
"gt_answer": gold_answer,
"response": generated_answer,
"category": category,
"llm_label": label,
}
)

# Save intermediate results
with open(output_path, "w") as f:
@@ -108,8 +109,7 @@ def main():
print("All categories accuracy:")
for cat, results in LLM_JUDGE.items():
if results: # Only print if there are results for this category
print(f" Category {cat}: {np.mean(results):.4f} "
f"({sum(results)}/{len(results)})")
print(f" Category {cat}: {np.mean(results):.4f} " f"({sum(results)}/{len(results)})")
print("------------------------------------------")
index += 1
@@ -3,7 +3,7 @@ Borrowed from https://github.com/WujiangXu/AgenticMemory/blob/main/utils.py

@article{xu2025mem,
title={A-mem: Agentic memory for llm agents},
author={Xu, Wujiang and Liang, Zujie and Mei, Kai and Gao, Hang and Tan, Juntao
author={Xu, Wujiang and Liang, Zujie and Mei, Kai and Gao, Hang and Tan, Juntao
and Zhang, Yongfeng},
journal={arXiv preprint arXiv:2502.12110},
year={2025}
@@ -26,42 +26,45 @@ from sentence_transformers.util import pytorch_cos_sim

# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
except Exception as e:
print(f"Error downloading NLTK data: {e}")

# Initialize SentenceTransformer model (this will be reused)
try:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
except Exception as e:
print(f"Warning: Could not load SentenceTransformer model: {e}")
sentence_model = None


def simple_tokenize(text):
"""Simple tokenization function."""
# Convert to string if not already
text = str(text)
return text.lower().replace('.', ' ').replace(',', ' ').replace('!', ' ').replace('?', ' ').split()
return text.lower().replace(".", " ").replace(",", " ").replace("!", " ").replace("?", " ").split()


def calculate_rouge_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate ROUGE scores for prediction against reference."""
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(reference, prediction)
return {
'rouge1_f': scores['rouge1'].fmeasure,
'rouge2_f': scores['rouge2'].fmeasure,
'rougeL_f': scores['rougeL'].fmeasure
"rouge1_f": scores["rouge1"].fmeasure,
"rouge2_f": scores["rouge2"].fmeasure,
"rougeL_f": scores["rougeL"].fmeasure,
}


def calculate_bleu_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate BLEU scores with different n-gram settings."""
pred_tokens = nltk.word_tokenize(prediction.lower())
ref_tokens = [nltk.word_tokenize(reference.lower())]

weights_list = [(1, 0, 0, 0), (0.5, 0.5, 0, 0), (0.33, 0.33, 0.33, 0), (0.25, 0.25, 0.25, 0.25)]
smooth = SmoothingFunction().method1

scores = {}
for n, weights in enumerate(weights_list, start=1):
try:
@@ -69,26 +72,20 @@ def calculate_bleu_scores(prediction: str, reference: str) -> Dict[str, float]:
except Exception as e:
print(f"Error calculating BLEU score: {e}")
score = 0.0
scores[f'bleu{n}'] = score

scores[f"bleu{n}"] = score

return scores


def calculate_bert_scores(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate BERTScore for semantic similarity."""
try:
P, R, F1 = bert_score([prediction], [reference], lang='en', verbose=False)
return {
'bert_precision': P.item(),
'bert_recall': R.item(),
'bert_f1': F1.item()
}
P, R, F1 = bert_score([prediction], [reference], lang="en", verbose=False)
return {"bert_precision": P.item(), "bert_recall": R.item(), "bert_f1": F1.item()}
except Exception as e:
print(f"Error calculating BERTScore: {e}")
return {
'bert_precision': 0.0,
'bert_recall': 0.0,
'bert_f1': 0.0
}
return {"bert_precision": 0.0, "bert_recall": 0.0, "bert_f1": 0.0}


def calculate_meteor_score(prediction: str, reference: str) -> float:
"""Calculate METEOR score for the prediction."""
@@ -98,6 +95,7 @@ def calculate_meteor_score(prediction: str, reference: str) -> float:
print(f"Error calculating METEOR score: {e}")
return 0.0


def calculate_sentence_similarity(prediction: str, reference: str) -> float:
"""Calculate sentence embedding similarity using SentenceBERT."""
if sentence_model is None:
@@ -106,7 +104,7 @@ def calculate_sentence_similarity(prediction: str, reference: str) -> float:
# Encode sentences
embedding1 = sentence_model.encode([prediction], convert_to_tensor=True)
embedding2 = sentence_model.encode([reference], convert_to_tensor=True)

# Calculate cosine similarity
similarity = pytorch_cos_sim(embedding1, embedding2).item()
return float(similarity)
@@ -114,6 +112,7 @@ def calculate_sentence_similarity(prediction: str, reference: str) -> float:
print(f"Error calculating sentence similarity: {e}")
return 0.0


def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:
"""Calculate comprehensive evaluation metrics for a prediction."""
# Handle empty or None values
@@ -130,31 +129,31 @@ def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:
"bleu4": 0.0,
"bert_f1": 0.0,
"meteor": 0.0,
"sbert_similarity": 0.0
"sbert_similarity": 0.0,
}

# Convert to strings if they're not already
prediction = str(prediction).strip()
reference = str(reference).strip()

# Calculate exact match
exact_match = int(prediction.lower() == reference.lower())

# Calculate token-based F1 score
pred_tokens = set(simple_tokenize(prediction))
ref_tokens = set(simple_tokenize(reference))
common_tokens = pred_tokens & ref_tokens

if not pred_tokens or not ref_tokens:
f1 = 0.0
else:
precision = len(common_tokens) / len(pred_tokens)
recall = len(common_tokens) / len(ref_tokens)
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

# Calculate all scores
bleu_scores = calculate_bleu_scores(prediction, reference)

# Combine all metrics
metrics = {
"exact_match": exact_match,
@@ -164,48 +163,49 @@ def calculate_metrics(prediction: str, reference: str) -> Dict[str, float]:

return metrics

def aggregate_metrics(all_metrics: List[Dict[str, float]], all_categories: List[int]) -> Dict[str, Dict[str, Union[float, Dict[str, float]]]]:

def aggregate_metrics(
all_metrics: List[Dict[str, float]], all_categories: List[int]
) -> Dict[str, Dict[str, Union[float, Dict[str, float]]]]:
"""Calculate aggregate statistics for all metrics, split by category."""
if not all_metrics:
return {}

# Initialize aggregates for overall and per-category metrics
aggregates = defaultdict(list)
category_aggregates = defaultdict(lambda: defaultdict(list))

# Collect all values for each metric, both overall and per category
for metrics, category in zip(all_metrics, all_categories):
for metric_name, value in metrics.items():
aggregates[metric_name].append(value)
category_aggregates[category][metric_name].append(value)

# Calculate statistics for overall metrics
results = {
"overall": {}
}

results = {"overall": {}}

for metric_name, values in aggregates.items():
results["overall"][metric_name] = {
'mean': statistics.mean(values),
'std': statistics.stdev(values) if len(values) > 1 else 0.0,
'median': statistics.median(values),
'min': min(values),
'max': max(values),
'count': len(values)
"mean": statistics.mean(values),
"std": statistics.stdev(values) if len(values) > 1 else 0.0,
"median": statistics.median(values),
"min": min(values),
"max": max(values),
"count": len(values),
}

# Calculate statistics for each category
for category in sorted(category_aggregates.keys()):
results[f"category_{category}"] = {}
for metric_name, values in category_aggregates[category].items():
if values: # Only calculate if we have values for this category
results[f"category_{category}"][metric_name] = {
'mean': statistics.mean(values),
'std': statistics.stdev(values) if len(values) > 1 else 0.0,
'median': statistics.median(values),
'min': min(values),
'max': max(values),
'count': len(values)
"mean": statistics.mean(values),
"std": statistics.stdev(values) if len(values) > 1 else 0.0,
"median": statistics.median(values),
"min": min(values),
"max": max(values),
"count": len(values),
}

return results
@@ -144,4 +144,4 @@ ANSWER_PROMPT_ZEP = """

Question: {{question}}
Answer:
"""
"""
@@ -21,23 +21,15 @@ class Experiment:


def main():
parser = argparse.ArgumentParser(description='Run memory experiments')
parser.add_argument('--technique_type', choices=TECHNIQUES, default='mem0',
help='Memory technique to use')
parser.add_argument('--method', choices=METHODS, default='add',
help='Method to use')
parser.add_argument('--chunk_size', type=int, default=1000,
help='Chunk size for processing')
parser.add_argument('--output_folder', type=str, default='results/',
help='Output path for results')
parser.add_argument('--top_k', type=int, default=30,
help='Number of top memories to retrieve')
parser.add_argument('--filter_memories', action='store_true', default=False,
help='Whether to filter memories')
parser.add_argument('--is_graph', action='store_true', default=False,
help='Whether to use graph-based search')
parser.add_argument('--num_chunks', type=int, default=1,
help='Number of chunks to process')
parser = argparse.ArgumentParser(description="Run memory experiments")
parser.add_argument("--technique_type", choices=TECHNIQUES, default="mem0", help="Memory technique to use")
parser.add_argument("--method", choices=METHODS, default="add", help="Method to use")
parser.add_argument("--chunk_size", type=int, default=1000, help="Chunk size for processing")
parser.add_argument("--output_folder", type=str, default="results/", help="Output path for results")
parser.add_argument("--top_k", type=int, default=30, help="Number of top memories to retrieve")
parser.add_argument("--filter_memories", action="store_true", default=False, help="Whether to filter memories")
parser.add_argument("--is_graph", action="store_true", default=False, help="Whether to use graph-based search")
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks to process")

args = parser.parse_args()

@@ -46,33 +38,18 @@ def main():

if args.technique_type == "mem0":
if args.method == "add":
memory_manager = MemoryADD(
data_path='dataset/locomo10.json',
is_graph=args.is_graph
)
memory_manager = MemoryADD(data_path="dataset/locomo10.json", is_graph=args.is_graph)
memory_manager.process_all_conversations()
elif args.method == "search":
output_file_path = os.path.join(
args.output_folder,
f"mem0_results_top_{args.top_k}_filter_{args.filter_memories}_graph_{args.is_graph}.json"
f"mem0_results_top_{args.top_k}_filter_{args.filter_memories}_graph_{args.is_graph}.json",
)
memory_searcher = MemorySearch(
output_file_path,
args.top_k,
args.filter_memories,
args.is_graph
)
memory_searcher.process_data_file('dataset/locomo10.json')
memory_searcher = MemorySearch(output_file_path, args.top_k, args.filter_memories, args.is_graph)
memory_searcher.process_data_file("dataset/locomo10.json")
elif args.technique_type == "rag":
output_file_path = os.path.join(
args.output_folder,
f"rag_results_{args.chunk_size}_k{args.num_chunks}.json"
)
rag_manager = RAGManager(
data_path="dataset/locomo10_rag.json",
chunk_size=args.chunk_size,
k=args.num_chunks
)
output_file_path = os.path.join(args.output_folder, f"rag_results_{args.chunk_size}_k{args.num_chunks}.json")
rag_manager = RAGManager(data_path="dataset/locomo10_rag.json", chunk_size=args.chunk_size, k=args.num_chunks)
rag_manager.process_all_conversations(output_file_path)
elif args.technique_type == "langmem":
output_file_path = os.path.join(args.output_folder, "langmem_results.json")
@@ -85,11 +62,7 @@ def main():
elif args.method == "search":
output_file_path = os.path.join(args.output_folder, "zep_search_results.json")
zep_manager = ZepSearch()
zep_manager.process_data_file(
"dataset/locomo10.json",
"1",
output_file_path
)
zep_manager.process_data_file("dataset/locomo10.json", "1", output_file_path)
elif args.technique_type == "openai":
output_file_path = os.path.join(args.output_folder, "openai_results.json")
openai_manager = OpenAIPredict()
@@ -28,14 +28,12 @@ def get_answer(question, speaker_1_user_id, speaker_1_memories, speaker_2_user_i
speaker_1_user_id=speaker_1_user_id,
speaker_1_memories=speaker_1_memories,
speaker_2_user_id=speaker_2_user_id,
speaker_2_memories=speaker_2_memories
speaker_2_memories=speaker_2_memories,
)

t1 = time.time()
response = client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[{"role": "system", "content": prompt}],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": prompt}], temperature=0.0
)
t2 = time.time()
return response.choices[0].message.content, t2 - t1
@@ -59,7 +57,9 @@ def prompt(state):


class LangMem:
def __init__(self,):
def __init__(
self,
):
self.store = InMemoryStore(
index={
"dims": 1536,
@@ -80,18 +80,12 @@ class LangMem:
)

def add_memory(self, message, config):
return self.agent.invoke(
{"messages": [{"role": "user", "content": message}]},
config=config
)
return self.agent.invoke({"messages": [{"role": "user", "content": message}]}, config=config)

def search_memory(self, query, config):
try:
t1 = time.time()
response = self.agent.invoke(
{"messages": [{"role": "user", "content": query}]},
config=config
)
response = self.agent.invoke({"messages": [{"role": "user", "content": query}]}, config=config)
t2 = time.time()
return response["messages"][-1].content, t2 - t1
except Exception as e:
@@ -102,7 +96,7 @@ class LangMem:
class LangMemManager:
def __init__(self, dataset_path):
self.dataset_path = dataset_path
with open(self.dataset_path, 'r') as f:
with open(self.dataset_path, "r") as f:
self.data = json.load(f)

def process_all_conversations(self, output_file_path):
@@ -123,7 +117,7 @@ class LangMemManager:

# Identify speakers
for conv in chat_history:
speakers.add(conv['speaker'])
speakers.add(conv["speaker"])

if len(speakers) != 2:
raise ValueError(f"Expected 2 speakers, got {len(speakers)}")
@@ -134,50 +128,52 @@ class LangMemManager:
# Add memories for each message
for conv in tqdm(chat_history, desc=f"Processing messages {key}", leave=False):
message = f"{conv['timestamp']} | {conv['speaker']}: {conv['text']}"
if conv['speaker'] == speaker1:
if conv["speaker"] == speaker1:
agent1.add_memory(message, config)
elif conv['speaker'] == speaker2:
elif conv["speaker"] == speaker2:
agent2.add_memory(message, config)
else:
raise ValueError(f"Expected speaker1 or speaker2, got {conv['speaker']}")

# Process questions
for q in tqdm(questions, desc=f"Processing questions {key}", leave=False):
category = q['category']
category = q["category"]

if int(category) == 5:
continue

answer = q['answer']
question = q['question']
answer = q["answer"]
question = q["question"]
response1, speaker1_memory_time = agent1.search_memory(question, config)
response2, speaker2_memory_time = agent2.search_memory(question, config)

generated_answer, response_time = get_answer(
question, speaker1, response1, speaker2, response2
)
generated_answer, response_time = get_answer(question, speaker1, response1, speaker2, response2)

result[key].append({
"question": question,
"answer": answer,
"response1": response1,
"response2": response2,
"category": category,
"speaker1_memory_time": speaker1_memory_time,
"speaker2_memory_time": speaker2_memory_time,
"response_time": response_time,
'response': generated_answer
})
result[key].append(
{
"question": question,
"answer": answer,
"response1": response1,
"response2": response2,
"category": category,
"speaker1_memory_time": speaker1_memory_time,
"speaker2_memory_time": speaker2_memory_time,
"response_time": response_time,
"response": generated_answer,
}
)

return result

# Use multiprocessing to process conversations in parallel
with mp.Pool(processes=10) as pool:
results = list(tqdm(
pool.imap(process_conversation, list(self.data.items())),
total=len(self.data),
desc="Processing conversations"
))
results = list(
tqdm(
pool.imap(process_conversation, list(self.data.items())),
total=len(self.data),
desc="Processing conversations",
)
)

# Combine results from all workers
for result in results:
@@ -185,5 +181,5 @@ class LangMemManager:
OUTPUT[key].extend(items)

# Save final results
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(OUTPUT, f, indent=4)
@@ -13,7 +13,7 @@ load_dotenv()


# Update custom instructions
custom_instructions ="""
custom_instructions = """
Generate personal memories that follow these guidelines:

1. Each memory should be self-contained with complete context, including:
@@ -47,7 +47,7 @@ class MemoryADD:
self.mem0_client = MemoryClient(
api_key=os.getenv("MEM0_API_KEY"),
org_id=os.getenv("MEM0_ORGANIZATION_ID"),
project_id=os.getenv("MEM0_PROJECT_ID")
project_id=os.getenv("MEM0_PROJECT_ID"),
)

self.mem0_client.update_project(custom_instructions=custom_instructions)
@@ -59,15 +59,16 @@ class MemoryADD:
self.load_data()

def load_data(self):
with open(self.data_path, 'r') as f:
with open(self.data_path, "r") as f:
self.data = json.load(f)
return self.data

def add_memory(self, user_id, message, metadata, retries=3):
for attempt in range(retries):
try:
_ = self.mem0_client.add(message, user_id=user_id, version="v2",
metadata=metadata, enable_graph=self.is_graph)
_ = self.mem0_client.add(
message, user_id=user_id, version="v2", metadata=metadata, enable_graph=self.is_graph
)
return
except Exception as e:
if attempt < retries - 1:
@@ -78,13 +79,13 @@ class MemoryADD:

def add_memories_for_speaker(self, speaker, messages, timestamp, desc):
for i in tqdm(range(0, len(messages), self.batch_size), desc=desc):
batch_messages = messages[i:i+self.batch_size]
batch_messages = messages[i : i + self.batch_size]
self.add_memory(speaker, batch_messages, metadata={"timestamp": timestamp})

def process_conversation(self, item, idx):
conversation = item['conversation']
speaker_a = conversation['speaker_a']
speaker_b = conversation['speaker_b']
conversation = item["conversation"]
speaker_a = conversation["speaker_a"]
speaker_b = conversation["speaker_b"]

speaker_a_user_id = f"{speaker_a}_{idx}"
speaker_b_user_id = f"{speaker_b}_{idx}"
@@ -94,7 +95,7 @@ class MemoryADD:
self.mem0_client.delete_all(user_id=speaker_b_user_id)

for key in conversation.keys():
if key in ['speaker_a', 'speaker_b'] or "date" in key or "timestamp" in key:
if key in ["speaker_a", "speaker_b"] or "date" in key or "timestamp" in key:
continue

date_time_key = key + "_date_time"
@@ -104,10 +105,10 @@ class MemoryADD:
messages = []
messages_reverse = []
for chat in chats:
if chat['speaker'] == speaker_a:
if chat["speaker"] == speaker_a:
messages.append({"role": "user", "content": f"{speaker_a}: {chat['text']}"})
messages_reverse.append({"role": "assistant", "content": f"{speaker_a}: {chat['text']}"})
elif chat['speaker'] == speaker_b:
elif chat["speaker"] == speaker_b:
messages.append({"role": "assistant", "content": f"{speaker_b}: {chat['text']}"})
messages_reverse.append({"role": "user", "content": f"{speaker_b}: {chat['text']}"})
else:
@@ -116,11 +117,11 @@ class MemoryADD:
# add memories for the two users on different threads
thread_a = threading.Thread(
target=self.add_memories_for_speaker,
args=(speaker_a_user_id, messages, timestamp, "Adding Memories for Speaker A")
args=(speaker_a_user_id, messages, timestamp, "Adding Memories for Speaker A"),
)
thread_b = threading.Thread(
target=self.add_memories_for_speaker,
args=(speaker_b_user_id, messages_reverse, timestamp, "Adding Memories for Speaker B")
args=(speaker_b_user_id, messages_reverse, timestamp, "Adding Memories for Speaker B"),
)

thread_a.start()
@@ -134,10 +135,7 @@ class MemoryADD:
if not self.data:
raise ValueError("No data loaded. Please set data_path and call load_data() first.")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(self.process_conversation, item, idx)
for idx, item in enumerate(self.data)
]
futures = [executor.submit(self.process_conversation, item, idx) for idx, item in enumerate(self.data)]

for future in futures:
future.result()
future.result()
@@ -16,12 +16,11 @@ load_dotenv()


class MemorySearch:

def __init__(self, output_path='results.json', top_k=10, filter_memories=False, is_graph=False):
def __init__(self, output_path="results.json", top_k=10, filter_memories=False, is_graph=False):
self.mem0_client = MemoryClient(
api_key=os.getenv("MEM0_API_KEY"),
org_id=os.getenv("MEM0_ORGANIZATION_ID"),
project_id=os.getenv("MEM0_PROJECT_ID")
project_id=os.getenv("MEM0_PROJECT_ID"),
)
self.top_k = top_k
self.openai_client = OpenAI()
@@ -42,11 +41,18 @@ class MemorySearch:
try:
if self.is_graph:
print("Searching with graph")
memories = self.mem0_client.search(query, user_id=user_id, top_k=self.top_k,
filter_memories=self.filter_memories, enable_graph=True, output_format='v1.1')
memories = self.mem0_client.search(
query,
user_id=user_id,
top_k=self.top_k,
filter_memories=self.filter_memories,
enable_graph=True,
output_format="v1.1",
)
else:
memories = self.mem0_client.search(query, user_id=user_id, top_k=self.top_k,
filter_memories=self.filter_memories)
memories = self.mem0_client.search(
query, user_id=user_id, top_k=self.top_k, filter_memories=self.filter_memories
)
break
except Exception as e:
print("Retrying...")
@@ -57,64 +63,86 @@ class MemorySearch:

end_time = time.time()
if not self.is_graph:
semantic_memories = [{'memory': memory['memory'],
'timestamp': memory['metadata']['timestamp'],
'score': round(memory['score'], 2)}
for memory in memories]
semantic_memories = [
{
"memory": memory["memory"],
"timestamp": memory["metadata"]["timestamp"],
"score": round(memory["score"], 2),
}
for memory in memories
]
graph_memories = None
else:
semantic_memories = [{'memory': memory['memory'],
'timestamp': memory['metadata']['timestamp'],
'score': round(memory['score'], 2)} for memory in memories['results']]
graph_memories = [{"source": relation['source'], "relationship": relation['relationship'], "target": relation['target']} for relation in memories['relations']]
semantic_memories = [
{
"memory": memory["memory"],
"timestamp": memory["metadata"]["timestamp"],
"score": round(memory["score"], 2),
}
for memory in memories["results"]
]
graph_memories = [
{"source": relation["source"], "relationship": relation["relationship"], "target": relation["target"]}
for relation in memories["relations"]
]
return semantic_memories, graph_memories, end_time - start_time

def answer_question(self, speaker_1_user_id, speaker_2_user_id, question, answer, category):
speaker_1_memories, speaker_1_graph_memories, speaker_1_memory_time = self.search_memory(speaker_1_user_id, question)
speaker_2_memories, speaker_2_graph_memories, speaker_2_memory_time = self.search_memory(speaker_2_user_id, question)
speaker_1_memories, speaker_1_graph_memories, speaker_1_memory_time = self.search_memory(
speaker_1_user_id, question
)
speaker_2_memories, speaker_2_graph_memories, speaker_2_memory_time = self.search_memory(
speaker_2_user_id, question
)

search_1_memory = [f"{item['timestamp']}: {item['memory']}"
for item in speaker_1_memories]
search_2_memory = [f"{item['timestamp']}: {item['memory']}"
for item in speaker_2_memories]
search_1_memory = [f"{item['timestamp']}: {item['memory']}" for item in speaker_1_memories]
search_2_memory = [f"{item['timestamp']}: {item['memory']}" for item in speaker_2_memories]

template = Template(self.ANSWER_PROMPT)
answer_prompt = template.render(
speaker_1_user_id=speaker_1_user_id.split('_')[0],
speaker_2_user_id=speaker_2_user_id.split('_')[0],
speaker_1_user_id=speaker_1_user_id.split("_")[0],
speaker_2_user_id=speaker_2_user_id.split("_")[0],
speaker_1_memories=json.dumps(search_1_memory, indent=4),
speaker_2_memories=json.dumps(search_2_memory, indent=4),
speaker_1_graph_memories=json.dumps(speaker_1_graph_memories, indent=4),
speaker_2_graph_memories=json.dumps(speaker_2_graph_memories, indent=4),
question=question
question=question,
)

t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, speaker_1_memories, speaker_2_memories, speaker_1_memory_time, speaker_2_memory_time, speaker_1_graph_memories, speaker_2_graph_memories, response_time
return (
response.choices[0].message.content,
speaker_1_memories,
speaker_2_memories,
speaker_1_memory_time,
speaker_2_memory_time,
speaker_1_graph_memories,
speaker_2_graph_memories,
response_time,
)

def process_question(self, val, speaker_a_user_id, speaker_b_user_id):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")

response, speaker_1_memories, speaker_2_memories, speaker_1_memory_time, speaker_2_memory_time, speaker_1_graph_memories, speaker_2_graph_memories, response_time = self.answer_question(
speaker_a_user_id,
speaker_b_user_id,
question,
answer,
category
)
(
response,
speaker_1_memories,
speaker_2_memories,
speaker_1_memory_time,
speaker_2_memory_time,
speaker_1_graph_memories,
speaker_2_graph_memories,
response_time,
) = self.answer_question(speaker_a_user_id, speaker_b_user_id, question, answer, category)

result = {
"question": question,
@@ -125,67 +153,63 @@ class MemorySearch:
"adversarial_answer": adversarial_answer,
"speaker_1_memories": speaker_1_memories,
"speaker_2_memories": speaker_2_memories,
'num_speaker_1_memories': len(speaker_1_memories),
'num_speaker_2_memories': len(speaker_2_memories),
'speaker_1_memory_time': speaker_1_memory_time,
'speaker_2_memory_time': speaker_2_memory_time,
"num_speaker_1_memories": len(speaker_1_memories),
"num_speaker_2_memories": len(speaker_2_memories),
"speaker_1_memory_time": speaker_1_memory_time,
"speaker_2_memory_time": speaker_2_memory_time,
"speaker_1_graph_memories": speaker_1_graph_memories,
"speaker_2_graph_memories": speaker_2_graph_memories,
"response_time": response_time
"response_time": response_time,
}

# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)

return result

def process_data_file(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)

for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
conversation = item['conversation']
speaker_a = conversation['speaker_a']
speaker_b = conversation['speaker_b']
qa = item["qa"]
conversation = item["conversation"]
speaker_a = conversation["speaker_a"]
speaker_b = conversation["speaker_b"]

speaker_a_user_id = f"{speaker_a}_{idx}"
speaker_b_user_id = f"{speaker_b}_{idx}"

for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
question_item,
speaker_a_user_id,
speaker_b_user_id
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(question_item, speaker_a_user_id, speaker_b_user_id)
self.results[idx].append(result)

# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)

# Final save at the end
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)

def process_questions_parallel(self, qa_list, speaker_a_user_id, speaker_b_user_id, max_workers=1):
def process_single_question(val):
result = self.process_question(val, speaker_a_user_id, speaker_b_user_id)
# Save results after each question is processed
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)
return result

with ThreadPoolExecutor(max_workers=max_workers) as executor:
results = list(tqdm(
executor.map(process_single_question, qa_list),
total=len(qa_list),
desc="Answering Questions"
))
results = list(
tqdm(executor.map(process_single_question, qa_list), total=len(qa_list), desc="Answering Questions")
)

# Final save at the end
with open(self.output_path, 'w') as f:
with open(self.output_path, "w") as f:
json.dump(self.results, f, indent=4)

return results
@@ -59,23 +59,19 @@ class OpenAIPredict:
self.results = defaultdict(list)

def search_memory(self, idx):

with open(f'memories/{idx}.txt', 'r') as file:
with open(f"memories/{idx}.txt", "r") as file:
memories = file.read()

return memories, 0

def process_question(self, val, idx):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")

response, search_memory_time, response_time, context = self.answer_question(
idx,
question
)
response, search_memory_time, response_time, context = self.answer_question(idx, question)

result = {
"question": question,
@@ -86,7 +82,7 @@ class OpenAIPredict:
"adversarial_answer": adversarial_answer,
"search_memory_time": search_memory_time,
"response_time": response_time,
"context": context
"context": context,
}

return result
@@ -95,43 +91,35 @@ class OpenAIPredict:
memories, search_memory_time = self.search_memory(idx)

template = Template(ANSWER_PROMPT)
answer_prompt = template.render(
memories=memories,
question=question
)
answer_prompt = template.render(memories=memories, question=question)

t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, search_memory_time, response_time, memories

def process_data_file(self, file_path, output_file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)

for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
qa = item["qa"]

for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
question_item,
idx
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(question_item, idx)
self.results[idx].append(result)

# Save results after each question is processed
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)

# Final save at the end
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)


@@ -141,4 +129,3 @@ if __name__ == "__main__":
args = parser.parse_args()
openai_predict = OpenAIPredict()
openai_predict.process_data_file("../../dataset/locomo10.json", args.output_file_path)
@@ -33,10 +33,7 @@ class RAGManager:

def generate_response(self, question, context):
template = Template(PROMPT)
prompt = template.render(
CONTEXT=context,
QUESTION=question
)
prompt = template.render(CONTEXT=context, QUESTION=question)

max_retries = 3
retries = 0
@@ -47,19 +44,21 @@ class RAGManager:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system",
"content": "You are a helpful assistant that can answer "
"questions based on the provided context."
"If the question involves timing, use the conversation date for reference."
"Provide the shortest possible answer."
"Use words directly from the conversation when possible."
"Avoid using subjects in your answer."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "You are a helpful assistant that can answer "
"questions based on the provided context."
"If the question involves timing, use the conversation date for reference."
"Provide the shortest possible answer."
"Use words directly from the conversation when possible."
"Avoid using subjects in your answer.",
},
{"role": "user", "content": prompt},
],
temperature=0
temperature=0,
)
t2 = time.time()
return response.choices[0].message.content.strip(), t2-t1
return response.choices[0].message.content.strip(), t2 - t1
except Exception as e:
retries += 1
if retries > max_retries:
@@ -69,21 +68,16 @@ class RAGManager:
def clean_chat_history(self, chat_history):
cleaned_chat_history = ""
for c in chat_history:
cleaned_chat_history += (f"{c['timestamp']} | {c['speaker']}: "
f"{c['text']}\n")
cleaned_chat_history += f"{c['timestamp']} | {c['speaker']}: " f"{c['text']}\n"

return cleaned_chat_history

def calculate_embedding(self, document):
response = self.client.embeddings.create(
model=os.getenv("EMBEDDING_MODEL"),
input=document
)
response = self.client.embeddings.create(model=os.getenv("EMBEDDING_MODEL"), input=document)
return response.data[0].embedding

def calculate_similarity(self, embedding1, embedding2):
return np.dot(embedding1, embedding2) / (
np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

def search(self, query, chunks, embeddings, k=1):
"""
@@ -101,10 +95,7 @@ class RAGManager:
"""
t1 = time.time()
query_embedding = self.calculate_embedding(query)
similarities = [
self.calculate_similarity(query_embedding, embedding)
for embedding in embeddings
]
similarities = [self.calculate_similarity(query_embedding, embedding) for embedding in embeddings]

# Get indices of top-k most similar chunks
if k == 1:
@@ -118,7 +109,7 @@ class RAGManager:
combined_chunks = "\n<->\n".join([chunks[i] for i in top_indices])

t2 = time.time()
return combined_chunks, t2-t1
return combined_chunks, t2 - t1

def create_chunks(self, chat_history, chunk_size=500):
"""
@@ -139,7 +130,7 @@ class RAGManager:

# Split into chunks based on token count
for i in range(0, len(tokens), chunk_size):
chunk_tokens = tokens[i:i+chunk_size]
chunk_tokens = tokens[i : i + chunk_size]
chunk = encoding.decode(chunk_tokens)
chunks.append(chunk)

@@ -159,13 +150,9 @@ class RAGManager:
chat_history = value["conversation"]
questions = value["question"]

chunks, embeddings = self.create_chunks(
chat_history, self.chunk_size
)
chunks, embeddings = self.create_chunks(chat_history, self.chunk_size)

for item in tqdm(
questions, desc="Answering questions", leave=False
):
for item in tqdm(questions, desc="Answering questions", leave=False):
question = item["question"]
answer = item.get("answer", "")
category = item["category"]
@@ -174,22 +161,20 @@ class RAGManager:
context = chunks[0]
search_time = 0
else:
context, search_time = self.search(
question, chunks, embeddings, k=self.k
)
response, response_time = self.generate_response(
question, context
)
context, search_time = self.search(question, chunks, embeddings, k=self.k)
response, response_time = self.generate_response(question, context)

FINAL_RESULTS[key].append({
"question": question,
"answer": answer,
"category": category,
"context": context,
"response": response,
"search_time": search_time,
"response_time": response_time,
})
FINAL_RESULTS[key].append(
{
"question": question,
"answer": answer,
"category": category,
"context": context,
"response": response,
"search_time": search_time,
"response_time": response_time,
}
)
with open(output_file_path, "w+") as f:
json.dump(FINAL_RESULTS, f, indent=4)
@@ -1,12 +1,3 @@
TECHNIQUES = [
"mem0",
"rag",
"langmem",
"zep",
"openai"
]
TECHNIQUES = ["mem0", "rag", "langmem", "zep", "openai"]

METHODS = [
"add",
"search"
]
METHODS = ["add", "search"]
@@ -19,12 +19,12 @@ class ZepAdd:
self.load_data()

def load_data(self):
with open(self.data_path, 'r') as f:
with open(self.data_path, "r") as f:
self.data = json.load(f)
return self.data

def process_conversation(self, run_id, item, idx):
conversation = item['conversation']
conversation = item["conversation"]

user_id = f"run_id_{run_id}_experiment_user_{idx}"
session_id = f"run_id_{run_id}_experiment_session_{idx}"
@@ -41,7 +41,7 @@ class ZepAdd:

print("Starting to add memories... for user", user_id)
for key in tqdm(conversation.keys(), desc=f"Processing user {user_id}"):
if key in ['speaker_a', 'speaker_b'] or "date" in key:
if key in ["speaker_a", "speaker_b"] or "date" in key:
continue

date_time_key = key + "_date_time"
@@ -51,11 +51,13 @@ class ZepAdd:
for chat in tqdm(chats, desc=f"Adding chats for {key}", leave=False):
self.zep_client.memory.add(
session_id=session_id,
messages=[Message(
role=chat['speaker'],
role_type="user",
content=f"{timestamp}: {chat['text']}",
)]
messages=[
Message(
role=chat["speaker"],
role_type="user",
content=f"{timestamp}: {chat['text']}",
)
],
)

def process_all_conversations(self, run_id):
@@ -71,4 +73,4 @@ if __name__ == "__main__":
parser.add_argument("--run_id", type=str, required=True)
args = parser.parse_args()
zep_add = ZepAdd(data_path="../../dataset/locomo10.json")
zep_add.process_all_conversations(args.run_id)
zep_add.process_all_conversations(args.run_id)
@@ -42,9 +42,9 @@ class ZepSearch:
return f"{edge.valid_at if edge.valid_at else 'date unknown'} - {(edge.invalid_at if edge.invalid_at else 'present')}"

def compose_search_context(self, edges: list[EntityEdge], nodes: list[EntityNode]) -> str:
facts = [f' - {edge.fact} ({self.format_edge_date_range(edge)})' for edge in edges]
entities = [f' - {node.name}: {node.summary}' for node in nodes]
return TEMPLATE.format(facts='\n'.join(facts), entities='\n'.join(entities))
facts = [f" - {edge.fact} ({self.format_edge_date_range(edge)})" for edge in edges]
entities = [f" - {node.name}: {node.summary}" for node in nodes]
return TEMPLATE.format(facts="\n".join(facts), entities="\n".join(entities))

def search_memory(self, run_id, idx, query, max_retries=3, retry_delay=1):
start_time = time.time()
@@ -52,8 +52,14 @@ class ZepSearch:
while retries < max_retries:
try:
user_id = f"run_id_{run_id}_experiment_user_{idx}"
edges_results = (self.zep_client.graph.search(user_id=user_id, reranker='cross_encoder', query=query, scope='edges', limit=20)).edges
node_results = (self.zep_client.graph.search(user_id=user_id, reranker='rrf', query=query, scope='nodes', limit=20)).nodes
edges_results = (
self.zep_client.graph.search(
user_id=user_id, reranker="cross_encoder", query=query, scope="edges", limit=20
)
).edges
node_results = (
self.zep_client.graph.search(user_id=user_id, reranker="rrf", query=query, scope="nodes", limit=20)
).nodes
context = self.compose_search_context(edges_results, node_results)
break
except Exception as e:
@@ -68,17 +74,13 @@ class ZepSearch:
return context, end_time - start_time

def process_question(self, run_id, val, idx):
question = val.get('question', '')
answer = val.get('answer', '')
category = val.get('category', -1)
evidence = val.get('evidence', [])
adversarial_answer = val.get('adversarial_answer', '')
question = val.get("question", "")
answer = val.get("answer", "")
category = val.get("category", -1)
evidence = val.get("evidence", [])
adversarial_answer = val.get("adversarial_answer", "")

response, search_memory_time, response_time, context = self.answer_question(
run_id,
idx,
question
)
response, search_memory_time, response_time, context = self.answer_question(run_id, idx, question)

result = {
"question": question,
@@ -89,7 +91,7 @@ class ZepSearch:
"adversarial_answer": adversarial_answer,
"search_memory_time": search_memory_time,
"response_time": response_time,
"context": context
"context": context,
}

return result
@@ -98,44 +100,35 @@ class ZepSearch:
context, search_memory_time = self.search_memory(run_id, idx, question)

template = Template(ANSWER_PROMPT_ZEP)
answer_prompt = template.render(
memories=context,
question=question
)
answer_prompt = template.render(memories=context, question=question)

t1 = time.time()
response = self.openai_client.chat.completions.create(
model=os.getenv("MODEL"),
messages=[
{"role": "system", "content": answer_prompt}
],
temperature=0.0
model=os.getenv("MODEL"), messages=[{"role": "system", "content": answer_prompt}], temperature=0.0
)
t2 = time.time()
response_time = t2 - t1
return response.choices[0].message.content, search_memory_time, response_time, context

def process_data_file(self, file_path, run_id, output_file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
data = json.load(f)

for idx, item in tqdm(enumerate(data), total=len(data), desc="Processing conversations"):
qa = item['qa']
qa = item["qa"]

for question_item in tqdm(qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False):
result = self.process_question(
run_id,
question_item,
idx
)
for question_item in tqdm(
qa, total=len(qa), desc=f"Processing questions for conversation {idx}", leave=False
):
result = self.process_question(run_id, question_item, idx)
self.results[idx].append(result)

# Save results after each question is processed
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)

# Final save at the end
with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(self.results, f, indent=4)