Commit 46c6504

update eval_utils with runs api change
also clean up if condition so that url will be printed even for first run
1 parent a0ed406 commit 46c6504
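
For readers skimming the diff: the core of the first change is that Logs are no longer grouped by a client-side `batch_id` generated with `uuid`, but by a Run created on the server via the Evaluations API. Below is a minimal sketch of that flow, condensed from the hunks further down — the `BaseHumanloop` client class and the `create_run` / `run_id` fields are taken from the diff itself, while the helper name `_start_run` is ours and purely illustrative:

from humanloop.client import BaseHumanloop  # eval_utils.py imports this as `from .client import BaseHumanloop`


def _start_run(
    client: BaseHumanloop, evaluation_id: str, dataset_version_id: str
) -> str:
    """Create a Run for an Evaluation and return its id.

    The returned Run id replaces the old `evaluation_id` + `batch_id` pair
    that used to be attached to every log request.
    """
    run = client.evaluations.create_run(
        id=evaluation_id,
        dataset={"version_id": dataset_version_id},
        orchestrated=False,
    )
    return run.id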

File tree

1 file changed: +115 -67 lines changed

src/humanloop/eval_utils.py

Lines changed: 115 additions & 67 deletions
@@ -18,7 +18,6 @@
 from typing_extensions import NotRequired, TypedDict
 import time
 import sys
-import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed

 from .client import BaseHumanloop
@@ -41,11 +40,13 @@
 from .types import ToolKernelRequest as Tool
 from .types import BooleanEvaluatorStatsResponse as BooleanStats
 from .types import NumericEvaluatorStatsResponse as NumericStats
-from .types import UpdateDatesetAction as UpdateDatasetAction  # TODO: fix original type typo
+from .types import (
+    UpdateDatesetAction as UpdateDatasetAction,
+)  # TODO: fix original type typo
 from .types import DatapointResponse as Datapoint
 from .types import (
     EvaluationStats,
-    VersionStatsResponse,
+    RunStatsResponse,
     EvaluatorArgumentsType,
     EvaluatorReturnTypeEnum,
     EvaluationResponse,
@@ -61,7 +62,9 @@
 if not logger.hasHandlers():
     logger.addHandler(console_handler)

-EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator]
+EvaluatorDict = Union[
+    CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator
+]
 Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict]
 FileType = Literal["flow", "prompt", "tool", "evaluator"]

@@ -202,9 +205,13 @@ def _run_eval(
         function_ = file.pop("callable")
     except KeyError as _:
         if type_ == "flow":
-            raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.")
+            raise ValueError(
+                "You must provide a `callable` for your Flow `file` to run a local eval."
+            )
         else:
-            logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")
+            logger.info(
+                f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop."
+            )

     custom_logger = file.pop("custom_logger", None)
     file_dict = {**file, **version}
@@ -222,15 +229,19 @@ def _run_eval(
         try:
             _ = Prompt.parse_obj(version)
         except ValidationError as error_:
-            logger.error(msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n)")
+            logger.error(
+                msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n)"
+            )
             raise error_
         hl_file = client.prompts.upsert(**file_dict)

     elif type_ == "tool":
         try:
             _ = Tool.parse_obj(version)
         except ValidationError as error_:
-            logger.error(msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n)")
+            logger.error(
+                msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n)"
+            )
             raise error_
         hl_file = client.tools.upsert(**file_dict)

@@ -263,7 +274,9 @@ def _run_eval(
                 attributes={"code": inspect.getsource(eval_function)},
                 evaluator_type="external",
             )
-            _ = client.evaluators.upsert(id=evaluator.get("id"), path=evaluator.get("path"), spec=spec)
+            _ = client.evaluators.upsert(
+                id=evaluator.get("id"), path=evaluator.get("path"), spec=spec
+            )

     # Validate upfront that the local Evaluators and Dataset fit
     requires_target = False
@@ -286,7 +299,6 @@ def _run_eval(
     try:
         evaluation = client.evaluations.create(
             name=name,
-            dataset={"file_id": hl_dataset.id},
             evaluators=[{"path": e["path"]} for e in evaluators],
             file={"id": hl_file.id},
         )
@@ -301,15 +313,21 @@ def _run_eval(
     if not evaluation:
         raise ValueError(f"Evaluation with name {name} not found.")

-    # Every run will generate a new batch of logs
-    batch_id = uuid.uuid4().hex[:10]  # ignore risk of collision
+    # Create a new Run
+    run = client.evaluations.create_run(
+        id=evaluation.id,
+        dataset={"version_id": hl_dataset.version_id},
+        orchestrated=False,
+    )
+
+    # Every Run will generate a new batch of Logs
+    run_id = run.id
     log_func = _get_log_func(
         client=client,
         type_=type_,
         file_id=hl_file.id,
         version_id=hl_file.version_id,
-        evaluation_id=evaluation.id,
-        batch_id=batch_id,
+        run_id=run_id,
     )

     # Define the function to execute your function in parallel and Log to Humanloop
@@ -318,7 +336,9 @@ def process_datapoint(datapoint: Datapoint):
         datapoint_dict = datapoint.dict()
         try:
             if "messages" in datapoint_dict:
-                output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"])
+                output = function_(
+                    **datapoint_dict["inputs"], messages=datapoint_dict["messages"]
+                )
             else:
                 output = function_(**datapoint_dict["inputs"])
             if custom_logger:
@@ -343,7 +363,9 @@ def process_datapoint(datapoint: Datapoint):
                 start_time=start_time,
                 end_time=datetime.now(),
             )
-            logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}")
+            logger.warning(
+                msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}"
+            )

         # Apply local Evaluators
         for local_evaluator in local_evaluators:
@@ -376,28 +398,35 @@ def process_datapoint(datapoint: Datapoint):
                     start_time=start_time,
                     end_time=datetime.now(),
                 )
-                logger.warning(f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}")
+                logger.warning(
+                    f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}"
+                )

     # Execute the function and send the logs to Humanloop in parallel
     total_datapoints = len(hl_dataset.datapoints)
     logger.info(f"\n{CYAN}Navigate to your Evaluation:{RESET}\n{evaluation.url}\n")
     logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}")
-    logger.info(f"{CYAN}Run ID: {batch_id}{RESET}")
+    logger.info(f"{CYAN}Run ID: {run_id}{RESET}")

     # Generate locally if a file `callable` is provided
     if function_:
         logger.info(
-            f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET} "
+            f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}' using {workers} workers{RESET} "
         )
         completed_tasks = 0
         with ThreadPoolExecutor(max_workers=workers) as executor:
-            futures = [executor.submit(process_datapoint, datapoint) for datapoint in hl_dataset.datapoints]
+            futures = [
+                executor.submit(process_datapoint, datapoint)
+                for datapoint in hl_dataset.datapoints
+            ]
             for _ in as_completed(futures):
                 completed_tasks += 1
                 _progress_bar(total_datapoints, completed_tasks)
     else:
         # TODO: trigger run when updated API is available
-        logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
+        logger.info(
+            f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}'{RESET}"
+        )

     # Wait for the Evaluation to complete then print the results
     complete = False
@@ -413,39 +442,43 @@ def process_datapoint(datapoint: Datapoint):
     logger.info(stats.report)

     checks: List[EvaluatorCheck] = []
-    if all(evaluator.get("threshold") is None for evaluator in evaluators) and len(stats.version_stats) == 1:
-        # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
-        # (Or the logs would not be helpful)
-        return checks
-    for evaluator in evaluators:
-        _, score, delta = check_evaluation_improvement(
-            evaluation=evaluation,
-            stats=stats,
-            evaluator_path=evaluator["path"],
-            batch_id=batch_id,
-        )
-        threshold_check = None
-        threshold = evaluator.get("threshold")
-        if threshold is not None:
-            threshold_check = check_evaluation_threshold(
+
+    # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
+    # (Or the logs would not be helpful)
+    if (
+        any(evaluator.get("threshold") is not None for evaluator in evaluators)
+        or len(stats.run_stats) > 1
+    ):
+        for evaluator in evaluators:
+            _, score, delta = check_evaluation_improvement(
                 evaluation=evaluation,
                 stats=stats,
                 evaluator_path=evaluator["path"],
-                threshold=threshold,
-                batch_id=batch_id,
+                run_id=run_id,
             )
-        checks.append(
-            EvaluatorCheck(
-                path=evaluator["path"],
-                # TODO: Add back in with number valence on Evaluators
-                # improvement_check=improvement_check,
-                score=score,
-                delta=delta,
-                threshold=threshold,
-                threshold_check=threshold_check,
-                evaluation_id=evaluation.id,
+            threshold_check = None
+            threshold = evaluator.get("threshold")
+            if threshold is not None:
+                threshold_check = check_evaluation_threshold(
+                    evaluation=evaluation,
+                    stats=stats,
+                    evaluator_path=evaluator["path"],
+                    threshold=threshold,
+                    run_id=run_id,
+                )
+            checks.append(
+                EvaluatorCheck(
+                    path=evaluator["path"],
+                    # TODO: Add back in with number valence on Evaluators
+                    # improvement_check=improvement_check,
+                    score=score,
+                    delta=delta,
+                    threshold=threshold,
+                    threshold_check=threshold_check,
+                    evaluation_id=evaluation.id,
+                )
             )
-        )
+
     logger.info(f"\n{CYAN}View your Evaluation:{RESET}\n{evaluation.url}\n")
     return checks

@@ -455,17 +488,15 @@ def _get_log_func(
     type_: FileType,
     file_id: str,
     version_id: str,
-    evaluation_id: str,
-    batch_id: str,
+    run_id: str,
 ) -> Callable:
     """Returns the appropriate log function pre-filled with common parameters."""
     log_request = {
         # TODO: why does the Log `id` field refer to the file ID in the API?
         # Why are both `id` and `version_id` needed in the API?
         "id": file_id,
         "version_id": version_id,
-        "evaluation_id": evaluation_id,
-        "batch_id": batch_id,
+        "run_id": run_id,
     }
     if type_ == "flow":
         return partial(client.flows.log, **log_request, trace_status="complete")
@@ -479,7 +510,9 @@ def _get_log_func(
         raise NotImplementedError(f"Unsupported File version: {type_}")


-def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Union[float, None]:
+def get_score_from_evaluator_stat(
+    stat: Union[NumericStats, BooleanStats],
+) -> Union[float, None]:
     """Get the score from an Evaluator Stat."""
     score = None
     if isinstance(stat, BooleanStats):
@@ -526,14 +559,18 @@ def _progress_bar(total: int, progress: int):


 def get_evaluator_stats_by_path(
-    stat: VersionStatsResponse, evaluation: EvaluationResponse
+    stat: RunStatsResponse, evaluation: EvaluationResponse
 ) -> Dict[str, Union[NumericStats, BooleanStats]]:
     """Get the Evaluator stats by path."""
     # TODO: Update the API so this is not necessary
-    evaluators_by_id = {evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators}
+    evaluators_by_id = {
+        evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
+    }
     evaluator_stats_by_path = {
-        evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
-        for evaluator_stat in stat.evaluator_version_stats
+        evaluators_by_id[
+            evaluator_stat.evaluator_version_id
+        ].version.path: evaluator_stat
+        for evaluator_stat in stat.evaluator_stats
     }
     return evaluator_stats_by_path

@@ -543,12 +580,13 @@ def check_evaluation_threshold(
     stats: EvaluationStats,
     evaluator_path: str,
     threshold: float,
-    batch_id: str,
+    run_id: str,
 ) -> bool:
     """Checks if the latest version has an average Evaluator result above a threshold."""
     # TODO: Update the API so this is not necessary
     evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+        stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+        evaluation=evaluation,
     )
     if evaluator_path in evaluator_stats_by_path:
         evaluator_stat = evaluator_stats_by_path[evaluator_path]
@@ -571,7 +609,7 @@ def check_evaluation_improvement(
     evaluation: EvaluationResponse,
     evaluator_path: str,
     stats: EvaluationStats,
-    batch_id: str,
+    run_id: str,
 ) -> Tuple[bool, float, float]:
     """
     Check the latest version has improved across for a specific Evaluator.
@@ -581,24 +619,34 @@ def check_evaluation_improvement(
     # TODO: Update the API so this is not necessary

     latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+        stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+        evaluation=evaluation,
     )
-    if len(stats.version_stats) == 1:
+    if len(stats.run_stats) == 1:
         logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
         return True, 0, 0

-    previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation)
-    if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
+    previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
+        stat=stats.run_stats[-2], evaluation=evaluation
+    )
+    if (
+        evaluator_path in latest_evaluator_stats_by_path
+        and evaluator_path in previous_evaluator_stats_by_path
+    ):
         latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
         previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
         diff = round(latest_score - previous_score, 2)
         if diff >= 0:
-            logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+            logger.info(
+                f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+            )
             return True, latest_score, diff
         else:
-            logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+            logger.info(
+                f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+            )
             return False, latest_score, diff
     else:
         raise ValueError(f"Evaluator {evaluator_path} not found in the stats.")
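
The second line of the commit message ("clean up if condition so that url will be printed even for first run") corresponds to the hunk at `@@ -413,39 +442,43 @@` above: the early `return checks` that used to skip the rest of the function is replaced by an `if` block, so the final `logger.info(...)` with the Evaluation URL is now always reached. The toy sketch below illustrates only that control-flow change, not the SDK code itself — the function name and parameters are purely illustrative:

from typing import Any, Dict, List


def summarize_checks(
    evaluators: List[Dict[str, Any]], num_runs: int, evaluation_url: str
) -> List[str]:
    """Illustrates the new control flow: checks run conditionally, the URL always prints."""
    checks: List[str] = []
    if any(e.get("threshold") is not None for e in evaluators) or num_runs > 1:
        for e in evaluators:
            # Stands in for the improvement/threshold checks in eval_utils.py
            checks.append(f"checked {e['path']}")
    # Previously an early `return checks` above meant this line was skipped
    # on a first run with no thresholds; now it always executes.
    print(f"View your Evaluation:\n{evaluation_url}\n")
    return checks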
