Commit 7052cf8

Author: Andrei Bratu (committed)
Added missing logic from #28 and #29
1 parent 5104392 commit 7052cf8

2 files changed: +62 -73 lines changed


src/humanloop/eval_utils/run.py

Lines changed: 56 additions & 43 deletions
@@ -9,7 +9,9 @@
 """

 import inspect
+import json
 import logging
+import copy
 import sys
 import threading
 import time
@@ -245,7 +247,7 @@ def run_eval(

     if isinstance(file, Callable):  # type: ignore
         # Decorated function
-        file_: File = file.file  # type: ignore
+        file_: File = copy.deepcopy(file.file)  # type: ignore
     else:
         file_ = file  # type: ignore
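A note on why the deep copy matters here: `run_eval` mutates the `file_` dict it works on (for example the `custom_logger` pop removed in the next hunk), so copying `file.file` keeps the decorated function's stored metadata intact across runs. A minimal sketch with a hypothetical stand-in object; only the `.file` attribute name comes from the diff above.

```python
import copy

class DecoratedFn:
    # Hypothetical stand-in for a Humanloop-decorated function: the decorator
    # stores File metadata on a `.file` attribute, as referenced in the diff.
    file = {"path": "my-project/my-prompt", "version": {"temperature": 0.7}}

file_ = copy.deepcopy(DecoratedFn.file)
file_.pop("version", None)            # later mutations stay local to this eval run
assert "version" in DecoratedFn.file  # the decorated function's metadata is untouched
```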

@@ -276,7 +278,6 @@ def run_eval(
     else:
         logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")

-    custom_logger = file_.pop("custom_logger", None)
     file_dict = {**file_, **version}
     hl_file: Union[PromptResponse, FlowResponse, ToolResponse, EvaluatorResponse]

@@ -316,8 +317,20 @@ def run_eval(
         raise NotImplementedError(f"Unsupported File type: {type_}")

     # Upsert the Dataset
-    hl_dataset = client.datasets.upsert(**dataset)
-    hl_dataset = client.datasets.get(id=hl_dataset.id, include_datapoints=True)
+    if "action" not in dataset:
+        dataset["action"] = "set"
+    if "datapoints" not in dataset:
+        dataset["datapoints"] = []
+        # Use `upsert` to get existing dataset ID if no datapoints provided, given we can't `get` on path.
+        dataset["action"] = "add"
+    hl_dataset = client.datasets.upsert(
+        **dataset,
+    )
+    hl_dataset = client.datasets.get(
+        id=hl_dataset.id,
+        version_id=hl_dataset.version_id,
+        include_datapoints=True,
+    )

     # Upsert the local Evaluators; other Evaluators are just referenced by `path` or `id`
     local_evaluators: List[Evaluator] = []
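A standalone sketch of the defaulting behaviour introduced above, with no client calls: a Dataset dict without `datapoints` gets an empty list and an `add` action so the upsert can resolve an existing dataset by path without replacing its contents. The field names and values mirror the diff; the sample path is hypothetical.

```python
def normalize_dataset(dataset: dict) -> dict:
    """Sketch of the defaulting logic applied before the upsert."""
    dataset = dict(dataset)  # don't mutate the caller's dict
    dataset.setdefault("action", "set")
    if "datapoints" not in dataset:
        # Upsert with an empty "add" so an existing dataset is resolved by
        # path/ID without overwriting its datapoints.
        dataset["datapoints"] = []
        dataset["action"] = "add"
    return dataset

print(normalize_dataset({"path": "my-project/my-dataset"}))
# {'path': 'my-project/my-dataset', 'action': 'add', 'datapoints': []}
```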
@@ -329,7 +342,9 @@ def run_eval(
             # TODO: support the case where `file` logs generated on Humanloop but Evaluator logs generated locally
             if function_ is None:
                 raise ValueError(
-                    f"Local Evaluators are only supported when generating Logs locally using your {type_}'s `callable`. Please provide a `callable` for your file in order to run Evaluators locally."
+                    "Local Evaluators are only supported when generating Logs locally using your "
+                    f"{type_}'s `callable`. Please provide a `callable` for your file in order "
+                    "to run Evaluators locally."
                 )
             local_evaluators.append(evaluator)
             spec = ExternalEvaluator(
@@ -345,16 +360,6 @@
             )
             function_ = typing.cast(Callable, function_)

-            # Validate signature of the called function
-            function_signature = inspect.signature(function_)
-            parameter_names = list(function_signature.parameters.keys())
-            if parameter_names != ["inputs", "messages"] and parameter_names != ["inputs"]:
-                raise ValueError(
-                    f"Your {type_}'s `callable` must have the signature `def "
-                    "function(inputs: dict, messages: Optional[dict] = None):` "
-                    "or `def function(inputs: dict):`"
-                )
-
     # Validate upfront that the local Evaluators and Dataset fit
     requires_target = False
     for local_evaluator in local_evaluators:
@@ -368,7 +373,8 @@ def run_eval(
                 missing_target += 1
         if missing_target > 0:
             raise ValueError(
-                f"{missing_target} Datapoints have no target. A target is required for the Evaluator: {local_evaluator['path']}"
+                f"{missing_target} Datapoints have no target. A target "
+                f"is required for the Evaluator: {local_evaluator['path']}"
             )

     # Get or create the Evaluation based on the name
@@ -431,14 +437,14 @@ def upload_callback(log: dict):
                 # handle the logging, which will call the upload_callback
                 # function above when it's done
                 function_(  # type: ignore
-                    datapoint_dict["inputs"],
+                    **datapoint_dict["inputs"],
                     messages=datapoint_dict["messages"],
                 )
             else:
                 # function_ is decorated by Humanloop, the OTel Exporter will
                 # handle the logging, which will call the upload_callback
                 # function above when it's done
-                function_(datapoint_dict["inputs"])  # type: ignore
+                function_(**datapoint_dict["inputs"])  # type: ignore

    else:
        # Define the function to execute your function in parallel and Log to Humanloop
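The switch from `function_(datapoint_dict["inputs"])` to `function_(**datapoint_dict["inputs"])` matters because the callable declares the dataset inputs as named parameters rather than accepting a single dict. A small illustration with a hypothetical callable and inputs dict:

```python
def answer(question: str, context: str = "") -> str:
    # Hypothetical eval target whose parameters mirror the Dataset `inputs` keys.
    return f"Q: {question} (context: {context or 'none'})"

inputs = {"question": "What is Humanloop?", "context": "docs"}

print(answer(**inputs))  # keys are unpacked onto the named parameters
# answer(inputs) would not raise here, but `question` would silently receive
# the whole dict; the `**` fix above avoids that call shape.
```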
@@ -461,20 +467,21 @@ def process_datapoint(dp: Datapoint, file_id: str, file_path: str, run_id: str):
                    )
                else:
                    output = function_(**datapoint_dict["inputs"])  # type: ignore
-                    if custom_logger:
-                        log = custom_logger(client=client, output=output)  # type: ignore
-                    else:
-                        if not isinstance(output, str):
+                    if not isinstance(output, str):
+                        try:
+                            output = json.dumps(output)
+                        # throw error if it fails to serialize
+                        except Exception as _:
                            raise ValueError(
-                                f"Your {type_}'s `callable` must return a string if you do not provide a custom logger."
+                                f"Your {type_}'s `callable` must return a string or a JSON serializable object."
                            )
-                        log = log_func(
-                            inputs=dp.inputs,
-                            output=output,
-                            source_datapoint_id=dp.id,
-                            start_time=start_time,
-                            end_time=datetime.now(),
-                        )
+                    log = log_func(
+                        inputs=datapoint.inputs,
+                        output=output,
+                        source_datapoint_id=datapoint.id,
+                        start_time=start_time,
+                        end_time=datetime.now(),
+                    )
            except Exception as e:
                log = log_func(
                    inputs=dp.inputs,
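A standalone sketch of the new output handling replacing `custom_logger`: non-string outputs are coerced with `json.dumps`, and anything unserializable raises the same kind of error as in the diff. The helper name and sample values are illustrative, not part of the SDK.

```python
import json
from datetime import datetime

def coerce_output(output):
    """Sketch of the serialization fallback added above."""
    if not isinstance(output, str):
        try:
            output = json.dumps(output)
        except Exception:
            raise ValueError(
                "The callable must return a string or a JSON serializable object."
            )
    return output

print(coerce_output({"answer": 42}))   # '{"answer": 42}'
print(coerce_output("plain string"))   # strings pass through unchanged
coerce_output(datetime.now())          # raises ValueError: datetime is not JSON serializable
```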
@@ -522,7 +529,11 @@ def process_datapoint(dp: Datapoint, file_id: str, file_path: str, run_id: str):
     while not complete:
         stats = client.evaluations.get_stats(id=evaluation.id)
         logger.info(f"\r{stats.progress}")
-        complete = stats.status == "completed"
+        run_stats = next(
+            (run_stats for run_stats in stats.run_stats if run_stats.run_id == run_id),
+            None,
+        )
+        complete = run_stats is not None and run_stats.status == "completed"
         if not complete:
             time.sleep(5)
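The polling loop now waits on the stats of this specific run rather than the Evaluation as a whole. A minimal sketch of the `next(...)` lookup, using stand-in objects in place of the API response; only the `run_id` and `status` attribute names come from the diff.

```python
from dataclasses import dataclass

@dataclass
class RunStats:  # stand-in for the SDK's per-run stats object
    run_id: str
    status: str

run_id = "rn_123"
all_run_stats = [RunStats("rn_122", "completed"), RunStats("rn_123", "running")]

run_stats = next((rs for rs in all_run_stats if rs.run_id == run_id), None)
complete = run_stats is not None and run_stats.status == "completed"
print(complete)  # False: this run is still in progress even though another run finished
```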

@@ -681,12 +692,17 @@ def _check_evaluation_improvement(
         logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
         return True, 0, 0

-    previous_evaluator_stats_by_path = _get_evaluator_stats_by_path(stat=stats.run_stats[-2], evaluation=evaluation)
+    previous_evaluator_stats_by_path = _get_evaluator_stats_by_path(
+        stat=stats.run_stats[1],
+        evaluation=evaluation,
+    )
     if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
         latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = _get_score_from_evaluator_stat(stat=latest_evaluator_stat)
         previous_score = _get_score_from_evaluator_stat(stat=previous_evaluator_stat)
+        if latest_score is None or previous_score is None:
+            raise ValueError(f"Could not find score for Evaluator {evaluator_path}.")
         diff = round(latest_score - previous_score, 2)  # type: ignore
         if diff >= 0:
             logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
@@ -716,17 +732,14 @@ def _add_log_to_evaluation(
         else:
             judgement = eval_function(log)

-        if local_evaluator.get("custom_logger", None):
-            local_evaluator["custom_logger"](judgement, start_time, datetime.now())
-        else:
-            _ = client.evaluators.log(
-                parent_id=log["id"],
-                judgment=judgement,
-                id=local_evaluator.get("id"),
-                path=local_evaluator.get("path"),
-                start_time=start_time,
-                end_time=datetime.now(),
-            )
+        _ = client.evaluators.log(
+            parent_id=log["id"],
+            judgment=judgement,
+            id=local_evaluator.get("id"),
+            path=local_evaluator.get("path"),
+            start_time=start_time,
+            end_time=datetime.now(),
+        )
     except Exception as e:
         _ = client.evaluators.log(
             parent_id=log["id"],

src/humanloop/eval_utils/types.py

Lines changed: 6 additions & 30 deletions
@@ -46,23 +46,13 @@ class File(Identifiers):
     """The function being evaluated.
     It will be called using your Dataset `inputs` as follows: `output = callable(**datapoint.inputs)`.
     If `messages` are defined in your Dataset, then `output = callable(**datapoint.inputs, messages=datapoint.messages)`.
-    It should return a single string output. If not, you must provide a `custom_logger`.
+    It should return a string or json serializable output.
     """
-    custom_logger: NotRequired[Callable]
-    """function that logs the output of your function to Humanloop, replacing the default logging.
-    If provided, it will be called as follows:
-    ```
-    output = callable(**datapoint.inputs).
-    log = custom_logger(client, output)
-    ```
-    Inside the custom_logger, you can use the Humanloop `client` to log the output of your function.
-    If not provided your pipeline must return a single string.
-    """
-    is_decorated: NotRequired[Literal[True]]
+    is_decorated: NotRequired[bool]


 class Dataset(Identifiers):
-    datapoints: Sequence[DatapointDict]
+    datapoints: NotRequired[Sequence[DatapointDict]]
     """The datapoints to map your function over to produce the outputs required by the evaluation."""
     action: NotRequired[UpdateDatasetAction]
     """How to update the Dataset given the provided Datapoints;
@@ -72,26 +62,12 @@ class Dataset(Identifiers):
 class Evaluator(Identifiers):
     """The Evaluator to provide judgments for this Evaluation."""

-    custom_logger: NotRequired[Callable]
-
-    """The type of arguments the Evaluator expects - only required for local Evaluators."""
     args_type: NotRequired[EvaluatorArgumentsType]
-
-    """The type of return value the Evaluator produces - only required for local Evaluators."""
+    """The type of arguments the Evaluator expects - only required for local Evaluators."""
     return_type: NotRequired[EvaluatorReturnTypeEnum]
-
-    """The function to run on the logs to produce the judgment - only required for local Evaluators."""
+    """The type of return value the Evaluator produces - only required for local Evaluators."""
     callable: NotRequired[Callable]
-
-    """optional function that logs the output judgment from your Evaluator to Humanloop, if provided, it will be called as follows:
-    custom_logger: NotRequired[Callable]
-    ```
-    judgment = callable(log_dict)
-    log = custom_logger(client, judgment)
-    ```
-    Inside the custom_logger, you can use the Humanloop `client` to log the judgment to Humanloop.
-    If not provided your function must return a single string and by default the code will be used to inform the version of the external Evaluator on Humanloop.
-    """
+    """The function to run on the logs to produce the judgment - only required for local Evaluators."""
     threshold: NotRequired[float]
     """The threshold to check the Evaluator against. If the aggregate value of the Evaluator is below this threshold, the check will fail."""
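With `custom_logger` removed, a local Evaluator is described by its `callable` plus the argument/return types and an optional `threshold`. A hypothetical dict using the fields above; the `path`, the function, and the enum string values are assumptions for illustration only.

```python
def exact_match(log: dict) -> bool:
    # Hypothetical local Evaluator: compares the Log output to the datapoint target.
    return log["output"] == log.get("target", {}).get("answer")

evaluator = {
    "path": "my-project/exact-match",
    "callable": exact_match,
    "args_type": "target_required",  # assumed enum value
    "return_type": "boolean",        # assumed enum value
    "threshold": 0.8,                # check fails if the aggregate score drops below this
}
```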
