From 25c953f12edbc5d801d0221a04cb2e8934591973 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:05:48 +0800 Subject: [PATCH 1/8] feat: add auto-budget option --- src/gimbench/arguments.py | 1 + src/gimbench/mcqa/evaluators.py | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/gimbench/arguments.py b/src/gimbench/arguments.py index b231987..3adb339 100644 --- a/src/gimbench/arguments.py +++ b/src/gimbench/arguments.py @@ -113,6 +113,7 @@ def _add_mcqa_eval_args(parser): default=0, help="Number of reasoning steps to include in the prompt", ) + parser.add_argument("--auto_budget", action="store_true", help="Automatically determine the reasoning budget") def validate_and_standardize(args: argparse.Namespace) -> argparse.Namespace: diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index b11c824..8bbda5d 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -165,12 +165,28 @@ def __init__(self, args: Namespace, dataset: Dataset): self.model = SimpleGIM(args) def _form_cot_query(self, question: str, choices: list[str]) -> str: + reason_budget = self.args.reason_budget + if self.args.auto_budget: + try: + r = self.model.generate( + f"I'll show you a question. " + f"You need to determine how many reasoning steps are required to accurately answer it.\n\n" + f"## Question: Find the sum of first 5 positive integers.\n\n" + f"## Reasoning steps: 2\n\n" + f"## Question: {question}\n\n" + f"## Reasoning steps: " + guide(desc="A positive integer number", regex=r"\\d+") + ) + budget = int(r.tags[0].content) + except Exception as e: + logger.warning(f"Auto-budget determination failed: {e}") + budget = 1 + reason_budget = max(1, budget) + logger.info(f"Auto-determined reasoning budget: {reason_budget}") reasoning_guides = [ - f"## Step {idx + 1}\n\n" + guide(desc="One thinking step. About 60 words") - for idx in range(self.args.reason_budget) + f"## Step {idx + 1}\n\n" + guide(desc="One thinking step. About 60 words") for idx in range(reason_budget) ] prompt = SHARED_PROMPT_PREFIX + f"\n\nQuestion: {question}\n\n" - if self.args.reason_budget > 0: + if reason_budget > 0: prompt += "Let's think step by step.\n\n" + "\n\n".join(reasoning_guides) + "\n\n" prompt += "## Conclusion\n\nFinal answer: " + guide.select(choices=choices, name="predicted_choice") return prompt From 63fb04f458a58268e8d553e552ac1a97e0f47101 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:15:06 +0800 Subject: [PATCH 2/8] Update src/gimbench/mcqa/evaluators.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gimbench/mcqa/evaluators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index 8bbda5d..f06a9ae 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -174,9 +174,9 @@ def _form_cot_query(self, question: str, choices: list[str]) -> str: f"## Question: Find the sum of first 5 positive integers.\n\n" f"## Reasoning steps: 2\n\n" f"## Question: {question}\n\n" - f"## Reasoning steps: " + guide(desc="A positive integer number", regex=r"\\d+") + f"## Reasoning steps: " + guide(name="reason_budget", desc="A positive integer number", regex=r"\\d+") ) - budget = int(r.tags[0].content) + budget = int(r.tags["reason_budget"].content) except Exception as e: logger.warning(f"Auto-budget determination failed: {e}") budget = 1 From b308b128be8c5ff2d54d96bf45deb02911572cc8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 07:15:13 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/gimbench/mcqa/evaluators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index f06a9ae..eb5a5fc 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -174,7 +174,8 @@ def _form_cot_query(self, question: str, choices: list[str]) -> str: f"## Question: Find the sum of first 5 positive integers.\n\n" f"## Reasoning steps: 2\n\n" f"## Question: {question}\n\n" - f"## Reasoning steps: " + guide(name="reason_budget", desc="A positive integer number", regex=r"\\d+") + f"## Reasoning steps: " + + guide(name="reason_budget", desc="A positive integer number", regex=r"\\d+") ) budget = int(r.tags["reason_budget"].content) except Exception as e: From b6f1f79a2272122075d2056c180031a0f69c718d Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:16:41 +0800 Subject: [PATCH 4/8] Update src/gimbench/arguments.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gimbench/arguments.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gimbench/arguments.py b/src/gimbench/arguments.py index 3adb339..54a3d03 100644 --- a/src/gimbench/arguments.py +++ b/src/gimbench/arguments.py @@ -113,7 +113,11 @@ def _add_mcqa_eval_args(parser): default=0, help="Number of reasoning steps to include in the prompt", ) - parser.add_argument("--auto_budget", action="store_true", help="Automatically determine the reasoning budget") + parser.add_argument( + "--auto_budget", + action="store_true", + help="Automatically determine the reasoning budget (overrides --reason_budget if both are set)", + ) def validate_and_standardize(args: argparse.Namespace) -> argparse.Namespace: From 514298af5e32b30e8134339e7e5e152eca2327d0 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:20:14 +0800 Subject: [PATCH 5/8] fix: ensure budget defaults to 1 if not provided in GIMEvaluator --- src/gimbench/mcqa/evaluators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index eb5a5fc..baca7a8 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -177,7 +177,7 @@ def _form_cot_query(self, question: str, choices: list[str]) -> str: f"## Reasoning steps: " + guide(name="reason_budget", desc="A positive integer number", regex=r"\\d+") ) - budget = int(r.tags["reason_budget"].content) + budget = int(r.tags["reason_budget"].content or "1") except Exception as e: logger.warning(f"Auto-budget determination failed: {e}") budget = 1 From b17750d3a9be0ce34ede48dfe1da33f77065bc85 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:40:07 +0800 Subject: [PATCH 6/8] feat: enhance reasoning budget handling in MCQAEvaluator --- src/gimbench/mcqa/evaluators.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index baca7a8..a4d63f1 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -33,6 +33,7 @@ class EvalItemResult(BaseModel): response_tokens: int = -1 query_len: int = -1 response_len: int = -1 + reason_budget: int = -1 error_msg: str = "" additional_info: dict = {} @@ -52,6 +53,7 @@ class EvalResult(BaseEvalResult): avg_response_tokens: float avg_query_len: float avg_response_len: float + avg_reason_budget: float evaled_items: list[EvalItemResult] @@ -64,7 +66,10 @@ def __init__(self, args: Namespace, dataset: Dataset): logger.info(f"Loaded tokenizer {args.counter_tokenizer} for token counting.") @abstractmethod - def _form_cot_query(self, question: str, choices: list[str]) -> str: ... + def _get_reason_budget(self, question: str) -> int: ... + + @abstractmethod + def _form_cot_query(self, question: str, choices: list[str], *args) -> str: ... @abstractmethod def _model_call(self, query: str) -> Any: ... @@ -80,9 +85,15 @@ def _evaluate_item(self, item: dict) -> EvalItemResult: item["choices"], item["correct_choice"], ) - query = self._form_cot_query(question, choices) try: - raw_response = self._model_call(query) + if self.args.no_gimkit: + reason_budget = -1 + query = self._form_cot_query(question, choices) + raw_response = self._model_call(query) + else: + reason_budget = self._get_reason_budget(question) + query = self._form_cot_query(question, choices, reason_budget) + raw_response = self._model_call(query) response, model_choice, additional_info = self._parse_response(raw_response, choices) conclusion = model_choice == correct_choice error_msg = "" @@ -103,6 +114,7 @@ def _evaluate_item(self, item: dict) -> EvalItemResult: response_tokens=self._count_tokens(response) if response != "ERROR" else -1, query_len=len(query), response_len=len(response), + reason_budget=reason_budget, error_msg=error_msg, additional_info=additional_info, ) @@ -141,6 +153,7 @@ def evaluate(self) -> EvalResult: avg_response_tokens=self._safe_average(evaled_items, "response_tokens"), avg_query_len=self._safe_average(evaled_items, "query_len"), avg_response_len=self._safe_average(evaled_items, "response_len"), + avg_reason_budget=self._safe_average(evaled_items, "reason_budget"), start_time=self.start_time, end_time=self.end_time, elapsed_minutes=(self.end_time - self.start_time).total_seconds() / 60.0, @@ -164,7 +177,7 @@ def __init__(self, args: Namespace, dataset: Dataset): super().__init__(args, dataset) self.model = SimpleGIM(args) - def _form_cot_query(self, question: str, choices: list[str]) -> str: + def _get_reason_budget(self, question: str) -> int: reason_budget = self.args.reason_budget if self.args.auto_budget: try: @@ -183,6 +196,9 @@ def _form_cot_query(self, question: str, choices: list[str]) -> str: budget = 1 reason_budget = max(1, budget) logger.info(f"Auto-determined reasoning budget: {reason_budget}") + return reason_budget + + def _form_cot_query(self, question: str, choices: list[str], reason_budget: int) -> str: reasoning_guides = [ f"## Step {idx + 1}\n\n" + guide(desc="One thinking step. About 60 words") for idx in range(reason_budget) ] @@ -210,7 +226,10 @@ def __init__(self, args: Namespace, dataset: Dataset): super().__init__(args, dataset) self.model = OpenAI(api_key=args.api_key, base_url=args.base_url) - def _form_cot_query(self, question: str, choices: list[str]) -> str: + def _get_reason_budget(self, question: str) -> int: + raise NotImplementedError("CommonEvaluator does not support reason budget.") + + def _form_cot_query(self, question: str, choices: list[str], *args) -> str: prompt = SHARED_PROMPT_PREFIX + ( " Remember to end with `The answer is: xxx`.\n\n" f"Question: {question}\n\n" From 1c04b4213eecd00256a6476bbda1b1af02d78bf7 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:48:19 +0800 Subject: [PATCH 7/8] Update src/gimbench/mcqa/evaluators.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gimbench/mcqa/evaluators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index a4d63f1..aa6ecdd 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -188,7 +188,7 @@ def _get_reason_budget(self, question: str) -> int: f"## Reasoning steps: 2\n\n" f"## Question: {question}\n\n" f"## Reasoning steps: " - + guide(name="reason_budget", desc="A positive integer number", regex=r"\\d+") + + guide(name="reason_budget", desc="A positive integer number", regex=r"\d+") ) budget = int(r.tags["reason_budget"].content or "1") except Exception as e: From 06981614b2045a0470f37a0fd3f8b76a53b93d15 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:52:19 +0800 Subject: [PATCH 8/8] feat: implement fixed reasoning budget handling in GIMEvaluator --- src/gimbench/mcqa/evaluators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gimbench/mcqa/evaluators.py b/src/gimbench/mcqa/evaluators.py index aa6ecdd..f8e5c58 100644 --- a/src/gimbench/mcqa/evaluators.py +++ b/src/gimbench/mcqa/evaluators.py @@ -178,7 +178,6 @@ def __init__(self, args: Namespace, dataset: Dataset): self.model = SimpleGIM(args) def _get_reason_budget(self, question: str) -> int: - reason_budget = self.args.reason_budget if self.args.auto_budget: try: r = self.model.generate( @@ -196,6 +195,8 @@ def _get_reason_budget(self, question: str) -> int: budget = 1 reason_budget = max(1, budget) logger.info(f"Auto-determined reasoning budget: {reason_budget}") + else: + reason_budget = self.args.reason_budget return reason_budget def _form_cot_query(self, question: str, choices: list[str], reason_budget: int) -> str: