save results locally

2025-10-29 18:53:58 +08:00
parent 87ce104931
commit d93c72f24d


@@ -3,6 +3,10 @@ from typing import Type, Literal
 import tyro
 from loguru import logger
 import functools
+import os
+import os.path as osp
+import glob
+import pandas as pd
 from lang_agent.config import InstantiateConfig
 from lang_agent.pipeline import Pipeline, PipelineConfig
@@ -10,9 +14,6 @@ from lang_agent.eval.validator import ValidatorConfig, Validator
 from langsmith import Client
-from langchain_core.messages import HumanMessage
-from langchain_core.runnables import RunnableLambda
 @tyro.conf.configure(tyro.conf.SuppressFixed)
 @dataclass
 class EvaluatorConfig(InstantiateConfig):
@@ -27,6 +28,8 @@ class EvaluatorConfig(InstantiateConfig):
     dataset_name:Literal["Toxic Queries"] = "dev_langagent"
     """name of the dataset to evaluate"""
+    log_dir:str = "logs"
     pipe_config: PipelineConfig = field(default_factory=PipelineConfig)
     validator_config: ValidatorConfig = field(default_factory=ValidatorConfig)
@@ -49,7 +52,6 @@ class Evaluator:
     def evaluate(self):
         logger.info("running experiment")
         inp_fnc = self.validator.get_inp_fnc(self.config.dataset_name)
         runnable = functools.partial(inp_fnc, pipeline=self.pipeline)
@@ -59,10 +61,34 @@ class Evaluator:
             evaluators=self.validator.get_val_fnc(self.config.dataset_name),
             experiment_prefix=self.config.experiment_prefix,
             description=self.config.experiment_desc,
-            max_concurrency=4
+            max_concurrency=4,
+            upload_results=False
         )
+
+    def save_results(self):
+        os.makedirs(self.config.log_dir, exist_ok=True)
+        assert hasattr(self, "result"), "NO RESULTS, run evaluate() before saving results"
+        head_path = osp.join(self.config.log_dir, f"{self.dataset.name}-{self.config.experiment_prefix}")
+        n_exp = len(glob.glob(f"{head_path}*"))
+        exp_save_f = osp.join(f"{head_path}-{n_exp}.csv")
+        df = self.result.to_pandas()
+        df.to_csv(exp_save_f, index=False)
+        metric_col = [e for e in df.columns if "feedback" in e]
+        df_curr_m = df[metric_col].mean()
+        metric_f = osp.join(self.config.log_dir, "0_exp_metrics.csv")  # start with 0 for first file in folder
+        if osp.exists(metric_f):
+            df_m = pd.read_csv(metric_f)
+            df_m = pd.concat([df_m, df_curr_m])
+        else:
+            df_m = df_curr_m
+        df_m.to_csv(metric_f, index=False)
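
For context, a minimal usage sketch of the flow this commit enables. It assumes the module path lang_agent.eval.evaluator, that EvaluatorConfig.instantiate() returns an Evaluator (implied by InstantiateConfig), and that evaluate() stores its return value on self.result (implied by the hasattr check); the experiment_prefix value is hypothetical.

    # Hypothetical driver script, not part of this commit: run an evaluation
    # without uploading to LangSmith, then persist the per-run CSV and the
    # rolling metrics file under log_dir.
    from lang_agent.eval.evaluator import EvaluatorConfig  # assumed module path

    config = EvaluatorConfig(
        dataset_name="dev_langagent",
        experiment_prefix="toxic-baseline",  # hypothetical prefix
        log_dir="logs",
    )
    evaluator = config.instantiate()  # assumed InstantiateConfig API
    evaluator.evaluate()              # runs langsmith evaluate with upload_results=False
    evaluator.save_results()          # writes logs/<dataset>-<prefix>-<n>.csv and logs/0_exp_metrics.csv

Each call to save_results() counts existing files matching the <dataset>-<prefix> head path with glob, so successive runs land in -0.csv, -1.csv, and so on, while the mean of the "feedback" columns is appended to 0_exp_metrics.csv.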