Files
lang-agent/lang_agent/eval/evaluator.py
2025-10-29 22:19:58 +08:00

97 lines
3.0 KiB
Python

from dataclasses import dataclass, field
from typing import Type, Literal
import tyro
from loguru import logger
import functools
import os
import os.path as osp
import glob
import pandas as pd
from lang_agent.config import InstantiateConfig
from lang_agent.pipeline import Pipeline, PipelineConfig
from lang_agent.eval.validator import ValidatorConfig, Validator
from langsmith import Client
@tyro.conf.configure(tyro.conf.SuppressFixed)
@dataclass
class EvaluatorConfig(InstantiateConfig):
    """Settings for a single evaluation run.

    ``_target`` defaults to ``Evaluator``, so this config is paired with
    that class.
    """

    _target: Type = field(default_factory=lambda: Evaluator)

    experiment_prefix: str = "simple test"
    """name of experiment"""

    experiment_desc: str = "testing if this works or not"
    """describe the experiment"""

    # The default value has to be one of the Literal's members; the original
    # annotation listed only "Toxic Queries" while defaulting to
    # "dev_langagent", so the declared choices and the default disagreed.
    dataset_name: Literal["Toxic Queries", "dev_langagent"] = "dev_langagent"
    """name of the dataset to evaluate"""

    log_dir: str = "logs"
    """directory where result and metric CSV files are written"""

    pipe_config: PipelineConfig = field(default_factory=PipelineConfig)
    """configuration used to build the pipeline under evaluation"""

    validator_config: ValidatorConfig = field(default_factory=ValidatorConfig)
    """configuration used to build the validator"""
class Evaluator:
    """Evaluate the configured pipeline on a dataset and store results as CSV.

    Typical use: construct with an ``EvaluatorConfig``, call ``evaluate()``,
    then ``save_results()``.
    """

    def __init__(self, config: EvaluatorConfig):
        """Store ``config`` and build every component via ``populate_modules``."""
        self.config = config
        self.populate_modules()

    def populate_modules(self):
        """Create the pipeline, client and validator, then read the dataset."""
        logger.info("preparing to run experiment")

        self.pipeline: Pipeline = self.config.pipe_config.setup()
        self.cli = Client()
        self.validator: Validator = self.config.validator_config.setup()

        self.dataset = self.cli.read_dataset(dataset_name=self.config.dataset_name)

    def evaluate(self):
        """Run the experiment and keep the outcome in ``self.result``.

        The validator supplies both the input function (bound to the pipeline
        with ``functools.partial``) and the evaluator functions for the
        configured dataset name.
        """
        logger.info("running experiment")

        inp_fnc = self.validator.get_inp_fnc(self.config.dataset_name)
        runnable = functools.partial(inp_fnc, pipeline=self.pipeline)

        self.result = self.cli.evaluate(
            runnable,
            data=self.dataset.name,
            evaluators=self.validator.get_val_fnc(self.config.dataset_name),
            experiment_prefix=self.config.experiment_prefix,
            description=self.config.experiment_desc,
            max_concurrency=4,
            upload_results=False,
        )

    def save_results(self):
        """Write the last experiment's rows and mean feedback metrics to CSV.

        Two files are touched inside ``config.log_dir``:

        * ``<dataset>-<experiment_prefix>-<n>.csv`` -- the full result table,
          where ``n`` is one more than the highest index already on disk for
          this dataset/prefix pair (``0`` for the first run).
        * ``0_exp_metrics.csv`` -- one row per saved run holding the mean of
          every column whose name contains ``"feedback"``; the new row is
          appended to the rows already stored there.

        Raises:
            AssertionError: if ``evaluate()`` has not been run yet.
        """
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``; the exception type is kept for existing callers.
        # Checked first so a failed call leaves the filesystem untouched.
        if not hasattr(self, "result"):
            raise AssertionError("NO RESULTS, run evaluate() before saving results")

        os.makedirs(self.config.log_dir, exist_ok=True)

        head_path = osp.join(self.config.log_dir, f"{self.dataset.name}-{self.config.experiment_prefix}")
        run_stem = f"{osp.basename(head_path)}-"

        # Pick the next index from files named exactly "<head>-<digits>.csv".
        # Counting everything matching "<head>*" (the previous approach) also
        # counted other experiments sharing the prefix, misread glob
        # metacharacters in names, and reused an existing index -- silently
        # overwriting that file -- once any earlier result had been deleted.
        used_indices = []
        for found in glob.glob(f"{glob.escape(head_path)}-*.csv"):
            suffix = osp.basename(found)[len(run_stem):-len(".csv")]
            if suffix.isdecimal():
                used_indices.append(int(suffix))
        n_exp = max(used_indices, default=-1) + 1

        exp_save_f = f"{head_path}-{n_exp}.csv"

        df = self.result.to_pandas()
        logger.info(f"saving experiment results to: {exp_save_f}")
        df.to_csv(exp_save_f, index=False)

        # Summarise this run: mean of each feedback column, as a single row
        # labelled with the run's file stem.
        metric_col = [e for e in df.columns if "feedback" in e]
        df_curr_m = df[metric_col].mean().to_frame().T
        df_curr_m.index = [f"{run_stem}{n_exp}"]

        metric_f = osp.join(self.config.log_dir, "0_exp_metrics.csv")  # start with 0 for first file in folder
        if osp.exists(metric_f):
            df_m = pd.read_csv(metric_f, index_col=0)
            df_m = pd.concat([df_m, df_curr_m])
        else:
            df_m = df_curr_m

        df_m.to_csv(metric_f)