From 1bdcf82d4561377842db2879d0e017bc3db63a12 Mon Sep 17 00:00:00 2001 From: Adnan Qureshi Date: Fri, 16 Jan 2026 17:30:37 +0530 Subject: [PATCH 1/4] feature(workflows): added a agentic workflow for sudoku --- trinity/common/workflows/__init__.py | 2 + trinity/common/workflows/sudoku_workflow.py | 78 +++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 trinity/common/workflows/sudoku_workflow.py diff --git a/trinity/common/workflows/__init__.py b/trinity/common/workflows/__init__.py index ea7390b4a4..6086dba4c9 100644 --- a/trinity/common/workflows/__init__.py +++ b/trinity/common/workflows/__init__.py @@ -48,6 +48,8 @@ # on-policy distillation workflows "on_policy_distill_workflow": "trinity.common.workflows.on_policy_distill_workflow.OnPolicyDistillWorkflow", "on_policy_distill_math_workflow": "trinity.common.workflows.on_policy_distill_workflow.OnPolicyDistillMathWorkflow", + # customed workflows + "sudoku_workflow": "trinity.common.workflows.sudoku_workflow.SudokuWorkflow", }, ) diff --git a/trinity/common/workflows/sudoku_workflow.py b/trinity/common/workflows/sudoku_workflow.py new file mode 100644 index 0000000000..d5605a8617 --- /dev/null +++ b/trinity/common/workflows/sudoku_workflow.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +""" +Sudoku Workflow for Trinity-RFT +This workflow demonstrates a simple single-turn task where the model solves a Sudoku puzzle. +""" + +from typing import List + +from trinity.common.experience import Experience +from trinity.common.models.model import ModelWrapper +from trinity.common.workflows import Task +from trinity.common.workflows.workflow import Workflow + + +class SudokuWorkflow(Workflow): + """ + Workflow: SudokuWorkflow + Purpose: Ask the model to solve a Sudoku puzzle and give reward based on correctness. + """ + + # Workflow does not support reset or repeated runs for now. + can_reset: bool = False + can_repeat: bool = False + + def __init__(self, task: Task, model: ModelWrapper, auxiliary_models=None): + """ + Initialize workflow with: + - task.raw_task["puzzle"] + - task.raw_task["solution"]g + """ + + super().__init__(task=task, model=model, auxiliary_models=auxiliary_models) + + # Extract puzzle input and ground truth + self.puzzle = task.raw_task.get("puzzle") + self.solution = task.raw_task.get("solution") + + # Rollout arguments (e.g., temperature, n) + self.rollout_args = task.rollout_args + + def calculate_reward(self, predicted: str) -> float: + """ + Reward function: + Returns 1.0 if predicted output matches solution exactly, else 0.0. + """ + return 1.0 if predicted.strip() == self.solution.strip() else 0.0 + + def run(self) -> List[Experience]: + """ + Primary execution step of the workflow: + 1. Send puzzle to model + 2. Collect response + 3. Evaluate with reward + 4. Package into Experience list + """ + + responses = self.model.chat( + [ + { + "role": "user", + "content": f"Solve this Sudoku puzzle:\n{self.puzzle}", + } + ], + temperature=self.rollout_args.temperature, + ) + + resp = responses[0] # Single response + reward = self.calculate_reward(resp.response_text) + + # Return experience in expected format + return [ + Experience( + tokens=resp.tokens, + prompt_length=resp.prompt_length, + reward=reward, + logprobs=resp.logprobs, + ) + ] From fe7131641c307471f6689aaa49bc8ee4b3c0d708 Mon Sep 17 00:00:00 2001 From: Adnan Qureshi Date: Fri, 16 Jan 2026 19:35:21 +0530 Subject: [PATCH 2/4] feature: added Sudoku generator, and judge --- trinity/common/workflows/sudoku_generator.py | 33 ++++ trinity/common/workflows/sudoku_judge.py | 43 +++++ trinity/common/workflows/sudoku_workflow.py | 166 +++++++++++-------- 3 files changed, 176 insertions(+), 66 deletions(-) create mode 100644 trinity/common/workflows/sudoku_generator.py create mode 100644 trinity/common/workflows/sudoku_judge.py diff --git a/trinity/common/workflows/sudoku_generator.py b/trinity/common/workflows/sudoku_generator.py new file mode 100644 index 0000000000..bd4bfd1b62 --- /dev/null +++ b/trinity/common/workflows/sudoku_generator.py @@ -0,0 +1,33 @@ +import random + + +class SudokuGenerator: + """ + Very simple Sudoku generator. + - Uses a fixed solved grid + - Removes 'holes' positions to create a puzzle + """ + + BASE_SOLUTION = [ + [5, 3, 4, 6, 7, 8, 9, 1, 2], + [6, 7, 2, 1, 9, 5, 3, 4, 8], + [1, 9, 8, 3, 4, 2, 5, 6, 7], + [8, 5, 9, 7, 6, 1, 4, 2, 3], + [4, 2, 6, 8, 5, 3, 7, 9, 1], + [7, 1, 3, 9, 2, 4, 8, 5, 6], + [9, 6, 1, 5, 3, 7, 2, 8, 4], + [2, 8, 7, 4, 1, 9, 6, 3, 5], + [3, 4, 5, 2, 8, 6, 1, 7, 9], + ] + + def generate(self, holes=40): + """Return (puzzle, solution) tuple.""" + solution = [row[:] for row in self.BASE_SOLUTION] + puzzle = [row[:] for row in solution] + + for _ in range(holes): + r = random.randint(0, 8) + c = random.randint(0, 8) + puzzle[r][c] = 0 + + return puzzle, solution diff --git a/trinity/common/workflows/sudoku_judge.py b/trinity/common/workflows/sudoku_judge.py new file mode 100644 index 0000000000..9fee423710 --- /dev/null +++ b/trinity/common/workflows/sudoku_judge.py @@ -0,0 +1,43 @@ +class SudokuJudge: + """ + Judge Sudoku board state. + - Checks row validity + - Checks column validity + - Checks 3x3 block validity + """ + + @staticmethod + def is_valid(board): + # Check rows + for row in board: + nums = [v for v in row if v != 0] + if len(nums) != len(set(nums)): + return False + + # Check columns + for col in range(9): + nums = [] + for row in range(9): + v = board[row][col] + if v != 0: + nums.append(v) + if len(nums) != len(set(nums)): + return False + + # Check 3x3 sub-grids + for br in range(0, 9, 3): + for bc in range(0, 9, 3): + nums = [] + for r in range(br, br + 3): + for c in range(bc, bc + 3): + v = board[r][c] + if v != 0: + nums.append(v) + if len(nums) != len(set(nums)): + return False + + return True + + @staticmethod + def is_solved(board, solution): + return board == solution diff --git a/trinity/common/workflows/sudoku_workflow.py b/trinity/common/workflows/sudoku_workflow.py index d5605a8617..5b99a4bb88 100644 --- a/trinity/common/workflows/sudoku_workflow.py +++ b/trinity/common/workflows/sudoku_workflow.py @@ -1,78 +1,112 @@ -# -*- coding: utf-8 -*- -""" -Sudoku Workflow for Trinity-RFT -This workflow demonstrates a simple single-turn task where the model solves a Sudoku puzzle. -""" - -from typing import List - from trinity.common.experience import Experience -from trinity.common.models.model import ModelWrapper -from trinity.common.workflows import Task from trinity.common.workflows.workflow import Workflow +from .sudoku_generator import SudokuGenerator +from .sudoku_judge import SudokuJudge + class SudokuWorkflow(Workflow): """ - Workflow: SudokuWorkflow - Purpose: Ask the model to solve a Sudoku puzzle and give reward based on correctness. + Multi-step Sudoku solving workflow. + - Shows current puzzle board to model + - Model returns a move: "r c v" + - Workflow applies move + - Judge checks validity + - Continues for max_steps """ - # Workflow does not support reset or repeated runs for now. - can_reset: bool = False - can_repeat: bool = False - - def __init__(self, task: Task, model: ModelWrapper, auxiliary_models=None): - """ - Initialize workflow with: - - task.raw_task["puzzle"] - - task.raw_task["solution"]g - """ + can_reset = True + def __init__(self, task, model, auxiliary_models=None): super().__init__(task=task, model=model, auxiliary_models=auxiliary_models) - # Extract puzzle input and ground truth - self.puzzle = task.raw_task.get("puzzle") - self.solution = task.raw_task.get("solution") - - # Rollout arguments (e.g., temperature, n) - self.rollout_args = task.rollout_args - - def calculate_reward(self, predicted: str) -> float: - """ - Reward function: - Returns 1.0 if predicted output matches solution exactly, else 0.0. - """ - return 1.0 if predicted.strip() == self.solution.strip() else 0.0 - - def run(self) -> List[Experience]: - """ - Primary execution step of the workflow: - 1. Send puzzle to model - 2. Collect response - 3. Evaluate with reward - 4. Package into Experience list - """ - - responses = self.model.chat( - [ - { - "role": "user", - "content": f"Solve this Sudoku puzzle:\n{self.puzzle}", - } - ], - temperature=self.rollout_args.temperature, - ) - - resp = responses[0] # Single response - reward = self.calculate_reward(resp.response_text) - - # Return experience in expected format - return [ - Experience( - tokens=resp.tokens, - prompt_length=resp.prompt_length, - reward=reward, - logprobs=resp.logprobs, + # If no dataset provided, generate puzzle + if "puzzle" in task.raw_task: + self.board = [row[:] for row in task.raw_task["puzzle"]] + self.solution = [row[:] for row in task.raw_task["solution"]] + else: + generator = SudokuGenerator() + self.board, self.solution = generator.generate() + + self.judge = SudokuJudge() + self.max_steps = 20 + + def reset(self, task): + """Reset puzzle for new task.""" + self.board = [row[:] for row in task.raw_task["puzzle"]] + self.solution = [row[:] for row in task.raw_task["solution"]] + + def parse_action(self, text): + """Expected model output: 'row col value'""" + try: + parts = text.strip().split() + if len(parts) != 3: + return None + r, c, v = map(int, parts) + if not (0 <= r <= 8 and 0 <= c <= 8 and 1 <= v <= 9): + return None + return r, c, v + except Exception: + return None + + def apply_move(self, r, c, v): + if self.board[r][c] == 0: + self.board[r][c] = v + + def run(self): + experiences = [] + + for step in range(self.max_steps): + prompt = f""" +Solve Sudoku by giving moves one at a time. +Current board (0 = empty): + +{self.board} + +Respond ONLY with: row col value +""" + + # Call model + responses = self.model.chat([{"role": "user", "content": prompt}]) + resp = responses[0] + + action = self.parse_action(resp.response_text) + if action is None: + reward = -1.0 + break + + r, c, v = action + self.apply_move(r, c, v) + + # Check validity + if not self.judge.is_valid(self.board): + reward = -1.0 + break + + # Check solved + if self.judge.is_solved(self.board, self.solution): + reward = 1.0 + experiences.append( + Experience( + tokens=resp.tokens, + prompt_length=resp.prompt_length, + reward=reward, + logprobs=resp.logprobs, + ) + ) + break + + # Neutral step reward + reward = 0.0 + + # Add experience + experiences.append( + Experience( + tokens=resp.tokens, + prompt_length=resp.prompt_length, + reward=reward, + logprobs=resp.logprobs, + ) ) - ] + + return experiences From 3b8796d8a3f750b78f80b8c4f2b71c8b764c2a05 Mon Sep 17 00:00:00 2001 From: Adnan Qureshi Date: Sat, 17 Jan 2026 19:00:34 +0530 Subject: [PATCH 3/4] feature(workflows): improve SudokuWorkflow prompt and generator --- trinity/common/workflows/__init__.py | 2 +- trinity/common/workflows/sudoku_generator.py | 51 +++++++- trinity/common/workflows/sudoku_workflow.py | 124 +++++++++++++++---- 3 files changed, 144 insertions(+), 33 deletions(-) diff --git a/trinity/common/workflows/__init__.py b/trinity/common/workflows/__init__.py index 6086dba4c9..ad38cb935c 100644 --- a/trinity/common/workflows/__init__.py +++ b/trinity/common/workflows/__init__.py @@ -48,7 +48,7 @@ # on-policy distillation workflows "on_policy_distill_workflow": "trinity.common.workflows.on_policy_distill_workflow.OnPolicyDistillWorkflow", "on_policy_distill_math_workflow": "trinity.common.workflows.on_policy_distill_workflow.OnPolicyDistillMathWorkflow", - # customed workflows + # custom workflows "sudoku_workflow": "trinity.common.workflows.sudoku_workflow.SudokuWorkflow", }, ) diff --git a/trinity/common/workflows/sudoku_generator.py b/trinity/common/workflows/sudoku_generator.py index bd4bfd1b62..761268d6dc 100644 --- a/trinity/common/workflows/sudoku_generator.py +++ b/trinity/common/workflows/sudoku_generator.py @@ -3,9 +3,13 @@ class SudokuGenerator: """ - Very simple Sudoku generator. - - Uses a fixed solved grid - - Removes 'holes' positions to create a puzzle + Lightweight Sudoku generator. + + This generator avoids relying on a single canonical solution by applying + randomized transformations to a solved grid before removing values to + create a puzzle. The difficulty is controlled by the number of removed + cells (holes). + """ BASE_SOLUTION = [ @@ -20,9 +24,46 @@ class SudokuGenerator: [3, 4, 5, 2, 8, 6, 1, 7, 9], ] + def _shuffle_solution(self, board): + """ + Randomize a solved Sudoku grid while preserving validity. + + This follows common Sudoku generation techniques: + - permuting numbers + - shuffling rows + - shuffling columns + """ + board = [row[:] for row in board] + + # Shuffle numbers 1–9 + numbers = list(range(1, 10)) + shuffled_numbers = numbers[:] + random.shuffle(shuffled_numbers) + mapping = dict(zip(numbers, shuffled_numbers)) + board = [[mapping[v] for v in row] for row in board] + + # Shuffle rows + random.shuffle(board) + + # Shuffle columns + board = list(map(list, zip(*board))) + random.shuffle(board) + board = list(map(list, zip(*board))) + + return board + def generate(self, holes=40): - """Return (puzzle, solution) tuple.""" - solution = [row[:] for row in self.BASE_SOLUTION] + """ + Generate a Sudoku puzzle. + + Args: + holes (int): Number of empty cells (0s) in the puzzle. + Larger values correspond to higher difficulty. + + Returns: + tuple: (puzzle, solution) + """ + solution = self._shuffle_solution(self.BASE_SOLUTION) puzzle = [row[:] for row in solution] for _ in range(holes): diff --git a/trinity/common/workflows/sudoku_workflow.py b/trinity/common/workflows/sudoku_workflow.py index 5b99a4bb88..dcc1df28dd 100644 --- a/trinity/common/workflows/sudoku_workflow.py +++ b/trinity/common/workflows/sudoku_workflow.py @@ -1,3 +1,5 @@ +import re + from trinity.common.experience import Experience from trinity.common.workflows.workflow import Workflow @@ -8,11 +10,13 @@ class SudokuWorkflow(Workflow): """ Multi-step Sudoku solving workflow. - - Shows current puzzle board to model - - Model returns a move: "r c v" - - Workflow applies move - - Judge checks validity - - Continues for max_steps + + This workflow follows a FrozenLake-style agentic interaction pattern: + - Maintains an internal environment state (Sudoku board) + - Interacts with the model step by step + - Provides explicit rules, task description, and strict output format + - Gives feedback on invalid or ineffective actions + - Terminates on success or failure """ can_reset = True @@ -20,8 +24,8 @@ class SudokuWorkflow(Workflow): def __init__(self, task, model, auxiliary_models=None): super().__init__(task=task, model=model, auxiliary_models=auxiliary_models) - # If no dataset provided, generate puzzle - if "puzzle" in task.raw_task: + # Initialize puzzle + if "puzzle" in task.raw_task and "solution" in task.raw_task: self.board = [row[:] for row in task.raw_task["puzzle"]] self.solution = [row[:] for row in task.raw_task["solution"]] else: @@ -31,59 +35,124 @@ def __init__(self, task, model, auxiliary_models=None): self.judge = SudokuJudge() self.max_steps = 20 + # State tracking (FrozenLake-style) + self.current_step = 0 + self.last_board = None + self.last_action = None + def reset(self, task): - """Reset puzzle for new task.""" + """Reset the workflow state for a new task.""" self.board = [row[:] for row in task.raw_task["puzzle"]] self.solution = [row[:] for row in task.raw_task["solution"]] + self.current_step = 0 + self.last_board = None + self.last_action = None + + def _build_prompt(self): + """ + Build a detailed, step-aware prompt inspired by the Frozen Lake example. + """ + prompt = ( + "You are playing a Sudoku game.\n\n" + "Game Rules:\n" + "- The board is a 9x9 grid.\n" + "- A value of 0 represents an empty cell.\n" + "- Each row must contain the numbers 1 through 9 exactly once.\n" + "- Each column must contain the numbers 1 through 9 exactly once.\n" + "- Each 3x3 sub-grid must contain the numbers 1 through 9 exactly once.\n" + "- You may only place numbers in empty cells.\n\n" + "Task:\n" + "- At each step, output ONE valid move to progress toward solving the puzzle.\n\n" + "Output Format (STRICT):\n" + "```row col value```\n\n" + "Example:\n" + "```0 2 4```\n\n" + f"Current Step: {self.current_step}\n" + f"Remaining Steps: {self.max_steps - self.current_step}\n\n" + f"Current Board:\n{self.board}\n" + ) + + if self.last_board is not None and self.board == self.last_board: + prompt += ( + "\nYour last response was invalid or had no effect. " + "Please recheck the Sudoku rules and the required output format." + ) + + return prompt def parse_action(self, text): - """Expected model output: 'row col value'""" + """ + Parse model output. + + Expected format: + ```row col value``` + """ + matches = re.findall(r"```(.*?)```", text, re.DOTALL) + if not matches: + return None + try: - parts = text.strip().split() + parts = matches[-1].strip().split() if len(parts) != 3: return None + r, c, v = map(int, parts) if not (0 <= r <= 8 and 0 <= c <= 8 and 1 <= v <= 9): return None + return r, c, v - except Exception: + except ValueError: return None def apply_move(self, r, c, v): + """Apply a move to the board if the cell is empty.""" if self.board[r][c] == 0: self.board[r][c] = v def run(self): + """ + Execute the Sudoku workflow step by step. + """ experiences = [] - for step in range(self.max_steps): - prompt = f""" -Solve Sudoku by giving moves one at a time. -Current board (0 = empty): - -{self.board} + for _ in range(self.max_steps): + prompt = self._build_prompt() -Respond ONLY with: row col value -""" - - # Call model responses = self.model.chat([{"role": "user", "content": prompt}]) resp = responses[0] + self.last_board = [row[:] for row in self.board] + action = self.parse_action(resp.response_text) if action is None: reward = -1.0 + experiences.append( + Experience( + tokens=resp.tokens, + prompt_length=resp.prompt_length, + reward=reward, + logprobs=resp.logprobs, + ) + ) break r, c, v = action self.apply_move(r, c, v) - # Check validity - if not self.judge.is_valid(self.board): + # Invalid or ineffective action + if self.board == self.last_board or not self.judge.is_valid(self.board): reward = -1.0 + experiences.append( + Experience( + tokens=resp.tokens, + prompt_length=resp.prompt_length, + reward=reward, + logprobs=resp.logprobs, + ) + ) break - # Check solved + # Solved if self.judge.is_solved(self.board, self.solution): reward = 1.0 experiences.append( @@ -96,10 +165,8 @@ def run(self): ) break - # Neutral step reward + # Intermediate step reward = 0.0 - - # Add experience experiences.append( Experience( tokens=resp.tokens, @@ -109,4 +176,7 @@ def run(self): ) ) + self.last_action = action + self.current_step += 1 + return experiences From bf1156ab69daa151e51ff74547f4180c5a44f2c5 Mon Sep 17 00:00:00 2001 From: Adnan Qureshi Date: Wed, 21 Jan 2026 02:33:18 +0530 Subject: [PATCH 4/4] improved sudoku workflow and generator --- trinity/common/workflows/sudoku_generator.py | 133 ++++++++++--------- trinity/common/workflows/sudoku_workflow.py | 129 ++++++++---------- 2 files changed, 120 insertions(+), 142 deletions(-) diff --git a/trinity/common/workflows/sudoku_generator.py b/trinity/common/workflows/sudoku_generator.py index 761268d6dc..bcca7e12cc 100644 --- a/trinity/common/workflows/sudoku_generator.py +++ b/trinity/common/workflows/sudoku_generator.py @@ -3,72 +3,73 @@ class SudokuGenerator: """ - Lightweight Sudoku generator. - - This generator avoids relying on a single canonical solution by applying - randomized transformations to a solved grid before removing values to - create a puzzle. The difficulty is controlled by the number of removed - cells (holes). + Sudoku puzzle generator inspired by standard backtracking-based generators. + - Generates a fresh solved Sudoku board using backtracking + - Removes cells based on difficulty (number of empty cells) + - Avoids relying on a single canonical solution """ - BASE_SOLUTION = [ - [5, 3, 4, 6, 7, 8, 9, 1, 2], - [6, 7, 2, 1, 9, 5, 3, 4, 8], - [1, 9, 8, 3, 4, 2, 5, 6, 7], - [8, 5, 9, 7, 6, 1, 4, 2, 3], - [4, 2, 6, 8, 5, 3, 7, 9, 1], - [7, 1, 3, 9, 2, 4, 8, 5, 6], - [9, 6, 1, 5, 3, 7, 2, 8, 4], - [2, 8, 7, 4, 1, 9, 6, 3, 5], - [3, 4, 5, 2, 8, 6, 1, 7, 9], - ] - - def _shuffle_solution(self, board): - """ - Randomize a solved Sudoku grid while preserving validity. - - This follows common Sudoku generation techniques: - - permuting numbers - - shuffling rows - - shuffling columns - """ - board = [row[:] for row in board] - - # Shuffle numbers 1–9 - numbers = list(range(1, 10)) - shuffled_numbers = numbers[:] - random.shuffle(shuffled_numbers) - mapping = dict(zip(numbers, shuffled_numbers)) - board = [[mapping[v] for v in row] for row in board] - - # Shuffle rows - random.shuffle(board) - - # Shuffle columns - board = list(map(list, zip(*board))) - random.shuffle(board) - board = list(map(list, zip(*board))) - - return board - - def generate(self, holes=40): - """ - Generate a Sudoku puzzle. - - Args: - holes (int): Number of empty cells (0s) in the puzzle. - Larger values correspond to higher difficulty. - - Returns: - tuple: (puzzle, solution) - """ - solution = self._shuffle_solution(self.BASE_SOLUTION) - puzzle = [row[:] for row in solution] - - for _ in range(holes): - r = random.randint(0, 8) - c = random.randint(0, 8) - puzzle[r][c] = 0 - - return puzzle, solution + def generate(self, difficulty="medium"): + holes_map = { + "easy": 30, + "medium": 40, + "hard": 50, + } + holes = holes_map.get(difficulty, 40) + + board = [[0 for _ in range(9)] for _ in range(9)] + self._fill_board(board) + + solution = [row[:] for row in board] + self._remove_cells(board, holes) + + return board, solution + + def _fill_board(self, board): + empty = self._find_empty(board) + if not empty: + return True + + r, c = empty + nums = list(range(1, 10)) + random.shuffle(nums) + + for v in nums: + if self._is_valid(board, r, c, v): + board[r][c] = v + if self._fill_board(board): + return True + board[r][c] = 0 + + return False + + def _find_empty(self, board): + for i in range(9): + for j in range(9): + if board[i][j] == 0: + return i, j + return None + + def _is_valid(self, board, r, c, v): + if v in board[r]: + return False + + if v in [board[i][c] for i in range(9)]: + return False + + br, bc = (r // 3) * 3, (c // 3) * 3 + for i in range(br, br + 3): + for j in range(bc, bc + 3): + if board[i][j] == v: + return False + + return True + + def _remove_cells(self, board, holes): + cells = [(i, j) for i in range(9) for j in range(9)] + random.shuffle(cells) + + for i in range(min(holes, 81)): + r, c = cells[i] + board[r][c] = 0 diff --git a/trinity/common/workflows/sudoku_workflow.py b/trinity/common/workflows/sudoku_workflow.py index dcc1df28dd..e65604ca30 100644 --- a/trinity/common/workflows/sudoku_workflow.py +++ b/trinity/common/workflows/sudoku_workflow.py @@ -1,5 +1,3 @@ -import re - from trinity.common.experience import Experience from trinity.common.workflows.workflow import Workflow @@ -8,23 +6,11 @@ class SudokuWorkflow(Workflow): - """ - Multi-step Sudoku solving workflow. - - This workflow follows a FrozenLake-style agentic interaction pattern: - - Maintains an internal environment state (Sudoku board) - - Interacts with the model step by step - - Provides explicit rules, task description, and strict output format - - Gives feedback on invalid or ineffective actions - - Terminates on success or failure - """ - can_reset = True def __init__(self, task, model, auxiliary_models=None): super().__init__(task=task, model=model, auxiliary_models=auxiliary_models) - # Initialize puzzle if "puzzle" in task.raw_task and "solution" in task.raw_task: self.board = [row[:] for row in task.raw_task["puzzle"]] self.solution = [row[:] for row in task.raw_task["solution"]] @@ -34,149 +20,140 @@ def __init__(self, task, model, auxiliary_models=None): self.judge = SudokuJudge() self.max_steps = 20 + self.max_moves_per_step = 5 - # State tracking (FrozenLake-style) self.current_step = 0 self.last_board = None self.last_action = None def reset(self, task): - """Reset the workflow state for a new task.""" self.board = [row[:] for row in task.raw_task["puzzle"]] self.solution = [row[:] for row in task.raw_task["solution"]] self.current_step = 0 self.last_board = None self.last_action = None + def render_board(self): + return "\n".join(" ".join(str(v) for v in row) for row in self.board) + def _build_prompt(self): - """ - Build a detailed, step-aware prompt inspired by the Frozen Lake example. - """ prompt = ( "You are playing a Sudoku game.\n\n" - "Game Rules:\n" - "- The board is a 9x9 grid.\n" - "- A value of 0 represents an empty cell.\n" - "- Each row must contain the numbers 1 through 9 exactly once.\n" - "- Each column must contain the numbers 1 through 9 exactly once.\n" - "- Each 3x3 sub-grid must contain the numbers 1 through 9 exactly once.\n" - "- You may only place numbers in empty cells.\n\n" + "Rules:\n" + "- The board is 9x9.\n" + "- 0 means empty.\n" + "- Numbers 1–9 must appear exactly once in every row, column, and 3x3 block.\n" + "- You may only fill empty cells.\n\n" "Task:\n" - "- At each step, output ONE valid move to progress toward solving the puzzle.\n\n" - "Output Format (STRICT):\n" - "```row col value```\n\n" + "- In each step, output ONE OR MORE valid moves.\n" + f"- You may output up to {self.max_moves_per_step} moves per step.\n\n" + "Output format (STRICT):\n" + "row col value\n" + "row col value\n\n" "Example:\n" - "```0 2 4```\n\n" - f"Current Step: {self.current_step}\n" - f"Remaining Steps: {self.max_steps - self.current_step}\n\n" - f"Current Board:\n{self.board}\n" + "0 2 4\n" + "1 3 5\n\n" + f"Current step: {self.current_step}\n" + f"Remaining steps: {self.max_steps - self.current_step}\n\n" + f"Current board:\n{self.render_board()}\n" ) if self.last_board is not None and self.board == self.last_board: prompt += ( - "\nYour last response was invalid or had no effect. " - "Please recheck the Sudoku rules and the required output format." + "\nYour previous response was invalid or had no effect. " + "Please follow the rules and output format strictly." ) return prompt def parse_action(self, text): - """ - Parse model output. - - Expected format: - ```row col value``` - """ - matches = re.findall(r"```(.*?)```", text, re.DOTALL) - if not matches: - return None - - try: - parts = matches[-1].strip().split() + lines = text.strip().splitlines() + actions = [] + + for line in lines: + line = line.strip() + if not line: + continue + parts = line.split() if len(parts) != 3: return None - - r, c, v = map(int, parts) + try: + r, c, v = map(int, parts) + except ValueError: + return None if not (0 <= r <= 8 and 0 <= c <= 8 and 1 <= v <= 9): return None + actions.append((r, c, v)) - return r, c, v - except ValueError: + if not actions or len(actions) > self.max_moves_per_step: return None - def apply_move(self, r, c, v): - """Apply a move to the board if the cell is empty.""" - if self.board[r][c] == 0: - self.board[r][c] = v + return actions def run(self): - """ - Execute the Sudoku workflow step by step. - """ experiences = [] for _ in range(self.max_steps): prompt = self._build_prompt() - responses = self.model.chat([{"role": "user", "content": prompt}]) resp = responses[0] self.last_board = [row[:] for row in self.board] - action = self.parse_action(resp.response_text) - if action is None: - reward = -1.0 + actions = self.parse_action(resp.response_text) + if actions is None: experiences.append( Experience( tokens=resp.tokens, prompt_length=resp.prompt_length, - reward=reward, + reward=-1.0, logprobs=resp.logprobs, ) ) break - r, c, v = action - self.apply_move(r, c, v) + board_changed = False + invalid_move = False + + for r, c, v in actions: + if self.board[r][c] != 0: + invalid_move = True + break + self.board[r][c] = v + board_changed = True - # Invalid or ineffective action - if self.board == self.last_board or not self.judge.is_valid(self.board): - reward = -1.0 + if invalid_move or not board_changed or not self.judge.is_valid(self.board): experiences.append( Experience( tokens=resp.tokens, prompt_length=resp.prompt_length, - reward=reward, + reward=-1.0, logprobs=resp.logprobs, ) ) break - # Solved if self.judge.is_solved(self.board, self.solution): - reward = 1.0 experiences.append( Experience( tokens=resp.tokens, prompt_length=resp.prompt_length, - reward=reward, + reward=1.0, logprobs=resp.logprobs, ) ) break - # Intermediate step - reward = 0.0 experiences.append( Experience( tokens=resp.tokens, prompt_length=resp.prompt_length, - reward=reward, + reward=0.0, logprobs=resp.logprobs, ) ) - self.last_action = action + self.last_action = actions self.current_step += 1 return experiences