diff --git a/lang_agent/eval/evaluator.py b/lang_agent/eval/evaluator.py
index b3ad69a..c7d07af 100644
--- a/lang_agent/eval/evaluator.py
+++ b/lang_agent/eval/evaluator.py
@@ -3,6 +3,10 @@ from typing import Type, Literal
 import tyro
 from loguru import logger
 import functools
+import os
+import os.path as osp
+import glob
+import pandas as pd
 
 from lang_agent.config import InstantiateConfig
 from lang_agent.pipeline import Pipeline, PipelineConfig
@@ -10,9 +14,6 @@ from lang_agent.eval.validator import ValidatorConfig, Validator
 
 from langsmith import Client
 
-from langchain_core.messages import HumanMessage
-from langchain_core.runnables import RunnableLambda
-
 @tyro.conf.configure(tyro.conf.SuppressFixed)
 @dataclass
 class EvaluatorConfig(InstantiateConfig):
@@ -27,6 +28,8 @@ class EvaluatorConfig(InstantiateConfig):
     dataset_name:Literal["Toxic Queries"] = "dev_langagent"
     """name of the dataset to evaluate"""
 
+    log_dir:str = "logs"
+
     pipe_config: PipelineConfig = field(default_factory=PipelineConfig)
     validator_config: ValidatorConfig = field(default_factory=ValidatorConfig)
 
@@ -49,7 +52,6 @@ class Evaluator:
 
     def evaluate(self):
         logger.info("running experiment")
-
         inp_fnc = self.validator.get_inp_fnc(self.config.dataset_name)
         runnable = functools.partial(inp_fnc, pipeline=self.pipeline)
 
@@ -59,6 +61,31 @@ class Evaluator:
             evaluators=self.validator.get_val_fnc(self.config.dataset_name),
             experiment_prefix=self.config.experiment_prefix,
             description=self.config.experiment_desc,
-            max_concurrency=4
+            max_concurrency=4,
+            upload_results=False
         )
-
+
+    def save_results(self):
+        """Write the latest run to its own CSV and append its mean feedback metrics to a rolling metrics file."""
+        os.makedirs(self.config.log_dir, exist_ok=True)
+
+        if not hasattr(self, "result"): raise RuntimeError("NO RESULTS, run evaluate() before saving results")
+
+        head_path = osp.join(self.config.log_dir, f"{self.dataset.name}-{self.config.experiment_prefix}")  # NOTE(review): assumes self.dataset is set elsewhere -- confirm
+        n_exp = len(glob.glob(f"{head_path}*"))
+        exp_save_f = f"{head_path}-{n_exp}.csv"
+
+        df = self.result.to_pandas()
+        df.to_csv(exp_save_f, index=False)
+
+        metric_col = [e for e in df.columns if "feedback" in e]
+
+        df_curr_m = df[metric_col].mean().to_frame().T  # one-row frame so the concat below aligns on columns
+
+        metric_f = osp.join(self.config.log_dir, "0_exp_metrics.csv")  # start with 0 for first file in folder
+        if osp.exists(metric_f):
+            df_m = pd.read_csv(metric_f)
+            df_m = pd.concat([df_m, df_curr_m])
+        else:
+            df_m = df_curr_m
+        df_m.to_csv(metric_f, index=False)