From 7fc45aeb055677adb254b6fa99eda7c27362bf72 Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:13:21 +0800
Subject: [PATCH 1/4] feat: display progress information in log

Fixes #27
---
 src/gimbench/ctp/evaluators.py   | 6 ++++++
 src/gimbench/match/evaluators.py | 6 ++++++
 src/gimbench/mcqa/evaluators.py  | 6 ++++++
 3 files changed, 18 insertions(+)

diff --git a/src/gimbench/ctp/evaluators.py b/src/gimbench/ctp/evaluators.py
index 189a77b..da3563b 100644
--- a/src/gimbench/ctp/evaluators.py
+++ b/src/gimbench/ctp/evaluators.py
@@ -104,6 +104,12 @@ def evaluate(self) -> EvalResult:
             result = self._evaluate_item(self.dataset[idx])
             evaled_items.append(result)
 
+            if idx % 10 == 0:
+                logger.info(f"Progress: {idx}/{total} items evaluated.")
+                logger.info(
+                    f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
+                )
+
         self.end_time = datetime.now()
         logger.info(f"Evaluation completed at {self.end_time}")
 
diff --git a/src/gimbench/match/evaluators.py b/src/gimbench/match/evaluators.py
index f24218b..4e66fc0 100644
--- a/src/gimbench/match/evaluators.py
+++ b/src/gimbench/match/evaluators.py
@@ -102,6 +102,12 @@ def evaluate(self) -> EvalResult:
             result = self._evaluate_item(self.dataset[idx])
             evaled_items.append(result)
 
+            if idx % 10 == 0:
+                logger.info(f"Progress: {idx}/{total} items evaluated.")
+                logger.info(
+                    f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
+                )
+
         self.end_time = datetime.now()
         logger.info(f"Evaluation completed at {self.end_time}")
 
diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py
index f8e5c58..d572cb8 100644
--- a/src/gimbench/mcqa/evaluators.py
+++ b/src/gimbench/mcqa/evaluators.py
@@ -128,6 +128,12 @@ def evaluate(self) -> EvalResult:
             for idx in tqdm(range(total), desc=f"Evaluating {self.args.model_name}"):
                 result = self._evaluate_item(self.dataset[idx])
                 evaled_items.append(result)
+
+                if idx % 10 == 0:
+                    logger.info(f"Progress: {idx}/{total} items evaluated.")
+                    logger.info(
+                        f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
+                    )
         else:
             with ThreadPoolExecutor(max_workers=self.args.num_proc) as executor:
                 results = executor.map(self._evaluate_item, (self.dataset[i] for i in range(total)))

From ee2c613d869e966a04fc0b8f497fae24fd29769e Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:28:22 +0800
Subject: [PATCH 2/4] feat: implement progress logging in evaluators

---
 src/gimbench/base.py             | 9 +++++++++
 src/gimbench/ctp/evaluators.py   | 6 +-----
 src/gimbench/match/evaluators.py | 6 +-----
 src/gimbench/mcqa/evaluators.py  | 6 +-----
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/gimbench/base.py b/src/gimbench/base.py
index 0c240f6..72ce85c 100644
--- a/src/gimbench/base.py
+++ b/src/gimbench/base.py
@@ -87,3 +87,12 @@ def __init__(self, args: Namespace, dataset: Dataset):
     def _safe_average(items: list, attr: str) -> float:
         values = [getattr(item, attr) for item in items if getattr(item, attr) != -1]
         return sum(values) / len(values) if values else 0.0
+
+    def _log_progress(self, total: int, curr_idx: int, log_interval: int = 10) -> None:
+        completed = curr_idx + 1
+        speed = (datetime.now() - self.start_time).total_seconds() / completed
+        if completed % log_interval == 0:
+            logger.info(
+                f"Progress: {completed}/{total} items evaluated with speed {speed:.2f} seconds/item. "
+                f"Time Remaining: {(total - completed) * speed / 60:.2f} minutes"
+            )
diff --git a/src/gimbench/ctp/evaluators.py b/src/gimbench/ctp/evaluators.py
index da3563b..18b1209 100644
--- a/src/gimbench/ctp/evaluators.py
+++ b/src/gimbench/ctp/evaluators.py
@@ -104,11 +104,7 @@ def evaluate(self) -> EvalResult:
             result = self._evaluate_item(self.dataset[idx])
             evaled_items.append(result)
 
-            if idx % 10 == 0:
-                logger.info(f"Progress: {idx}/{total} items evaluated.")
-                logger.info(
-                    f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
-                )
+            self._log_progress(total, idx)
 
         self.end_time = datetime.now()
         logger.info(f"Evaluation completed at {self.end_time}")
diff --git a/src/gimbench/match/evaluators.py b/src/gimbench/match/evaluators.py
index 4e66fc0..4213f16 100644
--- a/src/gimbench/match/evaluators.py
+++ b/src/gimbench/match/evaluators.py
@@ -102,11 +102,7 @@ def evaluate(self) -> EvalResult:
             result = self._evaluate_item(self.dataset[idx])
             evaled_items.append(result)
 
-            if idx % 10 == 0:
-                logger.info(f"Progress: {idx}/{total} items evaluated.")
-                logger.info(
-                    f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
-                )
+            self._log_progress(total, idx)
 
         self.end_time = datetime.now()
         logger.info(f"Evaluation completed at {self.end_time}")
diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py
index d572cb8..0996558 100644
--- a/src/gimbench/mcqa/evaluators.py
+++ b/src/gimbench/mcqa/evaluators.py
@@ -129,11 +129,7 @@ def evaluate(self) -> EvalResult:
                 result = self._evaluate_item(self.dataset[idx])
                 evaled_items.append(result)
 
-                if idx % 10 == 0:
-                    logger.info(f"Progress: {idx}/{total} items evaluated.")
-                    logger.info(
-                        f"Estimated Time Remaining: {(total - idx) * ((datetime.now() - self.start_time).total_seconds() / (idx + 1)) / 60:.2f} minutes"
-                    )
+                self._log_progress(total, idx)
         else:
             with ThreadPoolExecutor(max_workers=self.args.num_proc) as executor:
                 results = executor.map(self._evaluate_item, (self.dataset[i] for i in range(total)))

From efa41097a25ceb527ebf35342af63a0d9f57acf7 Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:29:50 +0800
Subject: [PATCH 3/4] feat: add TODO for progress logging in multi-threaded evaluation

---
 src/gimbench/mcqa/evaluators.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py
index 0996558..9c7e276 100644
--- a/src/gimbench/mcqa/evaluators.py
+++ b/src/gimbench/mcqa/evaluators.py
@@ -134,6 +134,7 @@ def evaluate(self) -> EvalResult:
             with ThreadPoolExecutor(max_workers=self.args.num_proc) as executor:
                 results = executor.map(self._evaluate_item, (self.dataset[i] for i in range(total)))
                 evaled_items = list(tqdm(results, total=total, desc=f"Evaluating {self.args.model_name}"))
+                # TODO: Add progress logging for multi-threaded evaluation
 
         errors = sum(1 for item in evaled_items if item.error_msg)
         corrects = sum(1 for item in evaled_items if item.conclusion)

From c7bbccb4ce1a891d37f54570d9e529e71ad2fbb3 Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:35:29 +0800
Subject: [PATCH 4/4] feat: improve progress logging speed calculation in BaseEvaluator

---
 src/gimbench/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gimbench/base.py b/src/gimbench/base.py
index 72ce85c..f48ca31 100644
--- a/src/gimbench/base.py
+++ b/src/gimbench/base.py
@@ -90,8 +90,8 @@ def _safe_average(items: list, attr: str) -> float:
 
     def _log_progress(self, total: int, curr_idx: int, log_interval: int = 10) -> None:
         completed = curr_idx + 1
-        speed = (datetime.now() - self.start_time).total_seconds() / completed
         if completed % log_interval == 0:
+            speed = (datetime.now() - self.start_time).total_seconds() / completed
             logger.info(
                 f"Progress: {completed}/{total} items evaluated with speed {speed:.2f} seconds/item. "
                 f"Time Remaining: {(total - completed) * speed / 60:.2f} minutes"
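
Note on the TODO left in PATCH 3/4: the sketch below is one possible way progress
logging could later be added to the multi-threaded path, reusing the same interval
and speed logic that PATCH 2/4 and PATCH 4/4 put into BaseEvaluator._log_progress.
It is a standalone illustration under assumptions, not part of this series; the
names evaluate_item, evaluate_all, and items are invented for the example and do
not exist in the gimbench codebase.

    import logging
    from concurrent.futures import ThreadPoolExecutor
    from datetime import datetime

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)


    def evaluate_item(item: int) -> int:
        # Stand-in for the real per-item evaluation call.
        return item * item


    def evaluate_all(items: list, num_proc: int = 4, log_interval: int = 10) -> list:
        total = len(items)
        start_time = datetime.now()
        results = []
        with ThreadPoolExecutor(max_workers=num_proc) as executor:
            # executor.map yields results in submission order, so progress can be
            # logged while the iterator is consumed instead of after list() finishes.
            for idx, result in enumerate(executor.map(evaluate_item, items)):
                results.append(result)
                completed = idx + 1
                if completed % log_interval == 0:
                    speed = (datetime.now() - start_time).total_seconds() / completed
                    logger.info(
                        f"Progress: {completed}/{total} items evaluated with speed {speed:.2f} seconds/item. "
                        f"Time Remaining: {(total - completed) * speed / 60:.2f} minutes"
                    )
        return results


    if __name__ == "__main__":
        evaluate_all(list(range(100)))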