save results locally
This commit is contained in:
@@ -3,6 +3,10 @@ from typing import Type, Literal
|
|||||||
import tyro
|
import tyro
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import functools
|
import functools
|
||||||
|
import os
|
||||||
|
import os.path as osp
|
||||||
|
import glob
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
from lang_agent.config import InstantiateConfig
|
from lang_agent.config import InstantiateConfig
|
||||||
from lang_agent.pipeline import Pipeline, PipelineConfig
|
from lang_agent.pipeline import Pipeline, PipelineConfig
|
||||||
@@ -10,9 +14,6 @@ from lang_agent.eval.validator import ValidatorConfig, Validator
|
|||||||
|
|
||||||
from langsmith import Client
|
from langsmith import Client
|
||||||
|
|
||||||
from langchain_core.messages import HumanMessage
|
|
||||||
from langchain_core.runnables import RunnableLambda
|
|
||||||
|
|
||||||
@tyro.conf.configure(tyro.conf.SuppressFixed)
|
@tyro.conf.configure(tyro.conf.SuppressFixed)
|
||||||
@dataclass
|
@dataclass
|
||||||
class EvaluatorConfig(InstantiateConfig):
|
class EvaluatorConfig(InstantiateConfig):
|
||||||
@@ -27,6 +28,8 @@ class EvaluatorConfig(InstantiateConfig):
|
|||||||
dataset_name:Literal["Toxic Queries"] = "dev_langagent"
|
dataset_name:Literal["Toxic Queries"] = "dev_langagent"
|
||||||
"""name of the dataset to evaluate"""
|
"""name of the dataset to evaluate"""
|
||||||
|
|
||||||
|
log_dir:str = "logs"
|
||||||
|
|
||||||
pipe_config: PipelineConfig = field(default_factory=PipelineConfig)
|
pipe_config: PipelineConfig = field(default_factory=PipelineConfig)
|
||||||
|
|
||||||
validator_config: ValidatorConfig = field(default_factory=ValidatorConfig)
|
validator_config: ValidatorConfig = field(default_factory=ValidatorConfig)
|
||||||
@@ -49,7 +52,6 @@ class Evaluator:
|
|||||||
def evaluate(self):
|
def evaluate(self):
|
||||||
logger.info("running experiment")
|
logger.info("running experiment")
|
||||||
|
|
||||||
|
|
||||||
inp_fnc = self.validator.get_inp_fnc(self.config.dataset_name)
|
inp_fnc = self.validator.get_inp_fnc(self.config.dataset_name)
|
||||||
runnable = functools.partial(inp_fnc, pipeline=self.pipeline)
|
runnable = functools.partial(inp_fnc, pipeline=self.pipeline)
|
||||||
|
|
||||||
@@ -59,10 +61,34 @@ class Evaluator:
|
|||||||
evaluators=self.validator.get_val_fnc(self.config.dataset_name),
|
evaluators=self.validator.get_val_fnc(self.config.dataset_name),
|
||||||
experiment_prefix=self.config.experiment_prefix,
|
experiment_prefix=self.config.experiment_prefix,
|
||||||
description=self.config.experiment_desc,
|
description=self.config.experiment_desc,
|
||||||
max_concurrency=4
|
max_concurrency=4,
|
||||||
|
upload_results=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(self):
    """Persist the evaluation results of the last ``evaluate()`` run to disk.

    Writes two CSV files under ``self.config.log_dir``:

    * ``{dataset}-{experiment_prefix}-{n}.csv`` — the full per-example
      results table, where ``n`` auto-increments per run so earlier
      experiments are never overwritten.
    * ``0_exp_metrics.csv`` — one row of mean "feedback" metrics per run,
      appended across runs ("0" prefix keeps it first when the folder is
      sorted by name).

    Raises:
        RuntimeError: if ``evaluate()`` has not been run yet (no
            ``self.result`` attribute).
    """
    os.makedirs(self.config.log_dir, exist_ok=True)

    # Explicit raise instead of `assert`: asserts are stripped under -O,
    # which would silently turn this guard into an AttributeError below.
    if not hasattr(self, "result"):
        raise RuntimeError("NO RESULTS, run evaluate() before saving results")

    head_path = osp.join(
        self.config.log_dir,
        f"{self.dataset.name}-{self.config.experiment_prefix}",
    )
    # Count prior runs with the same prefix to pick a fresh file suffix.
    n_exp = len(glob.glob(f"{head_path}*"))
    exp_save_f = f"{head_path}-{n_exp}.csv"  # osp.join on one arg was a no-op

    df = self.result.to_pandas()
    df.to_csv(exp_save_f, index=False)

    # Columns holding evaluator scores are named "...feedback..." by
    # langsmith's results table; average them into one summary row.
    metric_col = [e for e in df.columns if "feedback" in e]
    # `.mean()` returns a Series; make it a one-row DataFrame so the
    # columns line up with the metrics file read back below (concatenating
    # a Series onto a DataFrame would misalign rows and drop metric names
    # when written with index=False).
    df_curr_m = df[metric_col].mean().to_frame().T

    metric_f = osp.join(self.config.log_dir, "0_exp_metrics.csv")  # "0" sorts first in folder
    if osp.exists(metric_f):
        df_m = pd.read_csv(metric_f)
        df_m = pd.concat([df_m, df_curr_m], ignore_index=True)
    else:
        df_m = df_curr_m

    df_m.to_csv(metric_f, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user