@@ -18,7 +18,6 @@
 from typing_extensions import NotRequired, TypedDict
 import time
 import sys
-import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed

 from .client import BaseHumanloop
@@ -41,11 +40,13 @@
 from .types import ToolKernelRequest as Tool
 from .types import BooleanEvaluatorStatsResponse as BooleanStats
 from .types import NumericEvaluatorStatsResponse as NumericStats
-from .types import UpdateDatesetAction as UpdateDatasetAction  # TODO: fix original type typo
+from .types import (
+    UpdateDatesetAction as UpdateDatasetAction,
+)  # TODO: fix original type typo
 from .types import DatapointResponse as Datapoint
 from .types import (
     EvaluationStats,
-    VersionStatsResponse,
+    RunStatsResponse,
     EvaluatorArgumentsType,
     EvaluatorReturnTypeEnum,
     EvaluationResponse,
@@ -61,7 +62,9 @@
 if not logger.hasHandlers():
     logger.addHandler(console_handler)

-EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator]
+EvaluatorDict = Union[
+    CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator
+]
 Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict]
 FileType = Literal["flow", "prompt", "tool", "evaluator"]

@@ -202,9 +205,13 @@ def _run_eval(
         function_ = file.pop("callable")
     except KeyError as _:
         if type_ == "flow":
-            raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.")
+            raise ValueError(
+                "You must provide a `callable` for your Flow `file` to run a local eval."
+            )
         else:
-            logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")
+            logger.info(
+                f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop."
+            )

     custom_logger = file.pop("custom_logger", None)
     file_dict = {**file, **version}
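
For context, the `file` argument popped above is a plain dict. A hedged sketch of the shape this code expects; only `callable` and `custom_logger` are taken from the pops in this hunk, the other keys and values are assumptions for illustration:

# Hypothetical `file` dict consumed by _run_eval (shape partly assumed).
def my_flow_fn(question: str) -> str:
    # Stand-in for the user's Flow callable.
    return f"Answer to: {question}"

file = {
    "path": "evals/my-flow",   # assumed identifier field, not shown in this hunk
    "callable": my_flow_fn,    # popped here; required to run a Flow eval locally
    "custom_logger": None,     # popped here; optional
    # Remaining keys are merged with `version` into `file_dict` for the upsert.
}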
@@ -222,15 +229,19 @@ def _run_eval(
         try:
             _ = Prompt.parse_obj(version)
         except ValidationError as error_:
-            logger.error(msg=f"Invalid Prompt `version` in your `file` request.\n\nValidation error:\n)")
+            logger.error(
+                msg=f"Invalid Prompt `version` in your `file` request.\n\nValidation error:\n)"
+            )
             raise error_
         hl_file = client.prompts.upsert(**file_dict)

     elif type_ == "tool":
         try:
             _ = Tool.parse_obj(version)
         except ValidationError as error_:
-            logger.error(msg=f"Invalid Tool `version` in your `file` request.\n\nValidation error:\n)")
+            logger.error(
+                msg=f"Invalid Tool `version` in your `file` request.\n\nValidation error:\n)"
+            )
             raise error_
         hl_file = client.tools.upsert(**file_dict)

@@ -263,7 +274,9 @@ def _run_eval(
                 attributes={"code": inspect.getsource(eval_function)},
                 evaluator_type="external",
             )
-            _ = client.evaluators.upsert(id=evaluator.get("id"), path=evaluator.get("path"), spec=spec)
+            _ = client.evaluators.upsert(
+                id=evaluator.get("id"), path=evaluator.get("path"), spec=spec
+            )

     # Validate upfront that the local Evaluators and Dataset fit
     requires_target = False
@@ -286,7 +299,6 @@ def _run_eval(
     try:
         evaluation = client.evaluations.create(
             name=name,
-            dataset={"file_id": hl_dataset.id},
             evaluators=[{"path": e["path"]} for e in evaluators],
             file={"id": hl_file.id},
         )
@@ -301,15 +313,21 @@ def _run_eval(
     if not evaluation:
         raise ValueError(f"Evaluation with name {name} not found.")

-    # Every run will generate a new batch of logs
-    batch_id = uuid.uuid4().hex[:10]  # ignore risk of collision
+    # Create a new Run
+    run = client.evaluations.create_run(
+        id=evaluation.id,
+        dataset={"version_id": hl_dataset.version_id},
+        orchestrated=False,
+    )
+
+    # Every Run will generate a new batch of Logs
+    run_id = run.id
     log_func = _get_log_func(
         client=client,
         type_=type_,
         file_id=hl_file.id,
         version_id=hl_file.version_id,
-        evaluation_id=evaluation.id,
-        batch_id=batch_id,
+        run_id=run_id,
     )

     # Define the function to execute your function in parallel and Log to Humanloop
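
Taken together, this hunk swaps the ad-hoc `batch_id` for a first-class Run. A minimal sketch of the new sequence, reusing only the calls and arguments visible in the diff; the surrounding `client`, `evaluation`, `hl_file` and `hl_dataset` objects are assumed to exist as above:

# Sketch under the assumptions stated above; not a definitive wrapper.
run = client.evaluations.create_run(
    id=evaluation.id,
    dataset={"version_id": hl_dataset.version_id},
    orchestrated=False,  # Logs are generated locally rather than orchestrated by Humanloop
)
log_func = _get_log_func(
    client=client,
    type_="prompt",          # one of the FileType literals shown above
    file_id=hl_file.id,
    version_id=hl_file.version_id,
    run_id=run.id,           # replaces the old evaluation_id + batch_id pair
)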
@@ -318,7 +336,9 @@ def process_datapoint(datapoint: Datapoint):
         datapoint_dict = datapoint.dict()
         try:
             if "messages" in datapoint_dict:
-                output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"])
+                output = function_(
+                    **datapoint_dict["inputs"], messages=datapoint_dict["messages"]
+                )
             else:
                 output = function_(**datapoint_dict["inputs"])
             if custom_logger:
@@ -343,7 +363,9 @@ def process_datapoint(datapoint: Datapoint):
                 start_time=start_time,
                 end_time=datetime.now(),
             )
-            logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}.\nError: {str(e)}")
+            logger.warning(
+                msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}.\nError: {str(e)}"
+            )

         # Apply local Evaluators
         for local_evaluator in local_evaluators:
@@ -376,28 +398,35 @@ def process_datapoint(datapoint: Datapoint):
                     start_time=start_time,
                     end_time=datetime.now(),
                 )
-                logger.warning(f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}")
+                logger.warning(
+                    f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}"
+                )

     # Execute the function and send the logs to Humanloop in parallel
     total_datapoints = len(hl_dataset.datapoints)
     logger.info(f"\n{CYAN}Navigate to your Evaluation:{RESET}\n{evaluation.url}\n")
     logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}")
-    logger.info(f"{CYAN}Run ID: {batch_id}{RESET}")
+    logger.info(f"{CYAN}Run ID: {run_id}{RESET}")

     # Generate locally if a file `callable` is provided
     if function_:
         logger.info(
-            f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET}"
+            f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}' using {workers} workers{RESET}"
         )
         completed_tasks = 0
         with ThreadPoolExecutor(max_workers=workers) as executor:
-            futures = [executor.submit(process_datapoint, datapoint) for datapoint in hl_dataset.datapoints]
+            futures = [
+                executor.submit(process_datapoint, datapoint)
+                for datapoint in hl_dataset.datapoints
+            ]
             for _ in as_completed(futures):
                 completed_tasks += 1
                 _progress_bar(total_datapoints, completed_tasks)
     else:
         # TODO: trigger run when updated API is available
-        logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
+        logger.info(
+            f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}'{RESET}"
+        )

     # Wait for the Evaluation to complete then print the results
     complete = False
@@ -413,39 +442,43 @@ def process_datapoint(datapoint: Datapoint):
     logger.info(stats.report)

     checks: List[EvaluatorCheck] = []
-    if all(evaluator.get("threshold") is None for evaluator in evaluators) and len(stats.version_stats) == 1:
-        # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
-        # (Or the logs would not be helpful)
-        return checks
-    for evaluator in evaluators:
-        _, score, delta = check_evaluation_improvement(
-            evaluation=evaluation,
-            stats=stats,
-            evaluator_path=evaluator["path"],
-            batch_id=batch_id,
-        )
-        threshold_check = None
-        threshold = evaluator.get("threshold")
-        if threshold is not None:
-            threshold_check = check_evaluation_threshold(
+
+    # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
+    # (Or the logs would not be helpful)
+    if (
+        any(evaluator.get("threshold") is not None for evaluator in evaluators)
+        or len(stats.run_stats) > 1
+    ):
+        for evaluator in evaluators:
+            _, score, delta = check_evaluation_improvement(
                 evaluation=evaluation,
                 stats=stats,
                 evaluator_path=evaluator["path"],
-                threshold=threshold,
-                batch_id=batch_id,
+                run_id=run_id,
             )
-        checks.append(
-            EvaluatorCheck(
-                path=evaluator["path"],
-                # TODO: Add back in with number valence on Evaluators
-                # improvement_check=improvement_check,
-                score=score,
-                delta=delta,
-                threshold=threshold,
-                threshold_check=threshold_check,
-                evaluation_id=evaluation.id,
+            threshold_check = None
+            threshold = evaluator.get("threshold")
+            if threshold is not None:
+                threshold_check = check_evaluation_threshold(
+                    evaluation=evaluation,
+                    stats=stats,
+                    evaluator_path=evaluator["path"],
+                    threshold=threshold,
+                    run_id=run_id,
+                )
+            checks.append(
+                EvaluatorCheck(
+                    path=evaluator["path"],
+                    # TODO: Add back in with number valence on Evaluators
+                    # improvement_check=improvement_check,
+                    score=score,
+                    delta=delta,
+                    threshold=threshold,
+                    threshold_check=threshold_check,
+                    evaluation_id=evaluation.id,
+                )
             )
-        )
+
     logger.info(f"\n{CYAN}View your Evaluation:{RESET}\n{evaluation.url}\n")
     return checks

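A hedged usage sketch for the returned checks, for example to gate a CI job. The `EvaluatorCheck` fields come from the code above; treating a failed threshold as a process exit is an assumption:

# Assumes `checks` is the List[EvaluatorCheck] returned above; `sys` is imported at the top of the module.
failed = [c for c in checks if c.threshold is not None and not c.threshold_check]
if failed:
    sys.exit(f"Evaluation below threshold for: {[c.path for c in failed]}")
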
@@ -455,17 +488,15 @@ def _get_log_func(
     type_: FileType,
     file_id: str,
     version_id: str,
-    evaluation_id: str,
-    batch_id: str,
+    run_id: str,
 ) -> Callable:
     """Returns the appropriate log function pre-filled with common parameters."""
     log_request = {
         # TODO: why does the Log `id` field refer to the file ID in the API?
         # Why are both `id` and `version_id` needed in the API?
         "id": file_id,
         "version_id": version_id,
-        "evaluation_id": evaluation_id,
-        "batch_id": batch_id,
+        "run_id": run_id,
     }
     if type_ == "flow":
         return partial(client.flows.log, **log_request, trace_status="complete")
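
The `partial` above pre-fills the identifiers shared by every Log in the Run, so each call only supplies per-datapoint data. A hedged illustration with made-up IDs; `start_time`/`end_time` appear in the hunks above, while the other per-call argument names are assumptions:

from datetime import datetime
from functools import partial

log_request = {"id": "pr_123", "version_id": "prv_456", "run_id": "rn_789"}  # hypothetical IDs
prompt_log = partial(client.prompts.log, **log_request)  # `client` as passed into _get_log_func

# Per-datapoint call (argument names assumed beyond start_time/end_time).
prompt_log(
    inputs={"question": "What is 2 + 2?"},
    output="4",
    start_time=datetime.now(),
    end_time=datetime.now(),
)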
@@ -479,7 +510,9 @@ def _get_log_func(
         raise NotImplementedError(f"Unsupported File version: {type_}")


-def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Union[float, None]:
+def get_score_from_evaluator_stat(
+    stat: Union[NumericStats, BooleanStats],
+) -> Union[float, None]:
     """Get the score from an Evaluator Stat."""
     score = None
     if isinstance(stat, BooleanStats):
@@ -526,14 +559,18 @@ def _progress_bar(total: int, progress: int):


 def get_evaluator_stats_by_path(
-    stat: VersionStatsResponse, evaluation: EvaluationResponse
+    stat: RunStatsResponse, evaluation: EvaluationResponse
 ) -> Dict[str, Union[NumericStats, BooleanStats]]:
     """Get the Evaluator stats by path."""
     # TODO: Update the API so this is not necessary
-    evaluators_by_id = {evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators}
+    evaluators_by_id = {
+        evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
+    }
     evaluator_stats_by_path = {
-        evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
-        for evaluator_stat in stat.evaluator_version_stats
+        evaluators_by_id[
+            evaluator_stat.evaluator_version_id
+        ].version.path: evaluator_stat
+        for evaluator_stat in stat.evaluator_stats
     }
     return evaluator_stats_by_path

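The re-keying above goes Evaluator version ID to Evaluator to path. A small self-contained illustration of the same two-step mapping, with stand-in dataclasses and purely hypothetical data:

from dataclasses import dataclass

@dataclass
class _Version:
    version_id: str
    path: str

@dataclass
class _EvaluatorStat:
    evaluator_version_id: str
    mean: float

versions = [_Version("evv_1", "evals/accuracy"), _Version("evv_2", "evals/latency")]
stats = [_EvaluatorStat("evv_1", 0.91), _EvaluatorStat("evv_2", 120.0)]

by_id = {v.version_id: v for v in versions}              # key by version ID first
by_path = {by_id[s.evaluator_version_id].path: s for s in stats}  # then re-key stats by path
# -> {"evals/accuracy": _EvaluatorStat(...), "evals/latency": _EvaluatorStat(...)}
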
@@ -543,12 +580,13 @@ def check_evaluation_threshold(
     stats: EvaluationStats,
     evaluator_path: str,
     threshold: float,
-    batch_id: str,
+    run_id: str,
 ) -> bool:
     """Checks if the latest version has an average Evaluator result above a threshold."""
     # TODO: Update the API so this is not necessary
     evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+        stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+        evaluation=evaluation,
     )
     if evaluator_path in evaluator_stats_by_path:
         evaluator_stat = evaluator_stats_by_path[evaluator_path]
@@ -571,7 +609,7 @@ def check_evaluation_improvement(
     evaluation: EvaluationResponse,
     evaluator_path: str,
     stats: EvaluationStats,
-    batch_id: str,
+    run_id: str,
 ) -> Tuple[bool, float, float]:
     """
     Check the latest version has improved across for a specific Evaluator.
@@ -581,24 +619,34 @@ def check_evaluation_improvement(
     # TODO: Update the API so this is not necessary

     latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+        stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+        evaluation=evaluation,
     )
-    if len(stats.version_stats) == 1:
+    if len(stats.run_stats) == 1:
         logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
         return True, 0, 0

-    previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation)
-    if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
+    previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
+        stat=stats.run_stats[-2], evaluation=evaluation
+    )
+    if (
+        evaluator_path in latest_evaluator_stats_by_path
+        and evaluator_path in previous_evaluator_stats_by_path
+    ):
         latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
         previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
         diff = round(latest_score - previous_score, 2)
         if diff >= 0:
-            logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+            logger.info(
+                f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+            )
             return True, latest_score, diff
         else:
-            logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+            logger.info(
+                f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+            )
             return False, latest_score, diff
     else:
         raise ValueError(f"Evaluator {evaluator_path} not found in the stats.")
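
For the comparison at the end of this hunk, a tiny worked example of the returned tuple (the scores are made up):

previous_score, latest_score = 0.78, 0.84
diff = round(latest_score - previous_score, 2)  # 0.06
# diff >= 0, so check_evaluation_improvement would return (True, 0.84, 0.06)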