Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openhands-sdk/openhands/sdk/agent/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,5 +497,5 @@ def tools_map(self) -> dict[str, ToolDefinition]:
RuntimeError: If the agent has not been initialized.
"""
if not self._initialized:
raise RuntimeError("Agent not initialized; call initialize() before use")
raise RuntimeError("Agent not initialized; call _initialize() before use")
return self._tools
5 changes: 5 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __new__(
type[ConversationVisualizerBase] | ConversationVisualizerBase | None
) = DefaultConversationVisualizer,
secrets: dict[str, SecretValue] | dict[str, str] | None = None,
stop_agent_on_close: bool = False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: Confusing parameter name

The name stop_agent_on_close is misleading - it sounds like it stops the agent, but it actually controls whether server-side resources (tmux sessions, executors) are cleaned up. This will confuse users.

Consider a more descriptive name:

  • cleanup_server_resources
  • delete_conversation_on_close
  • release_resources_on_close

Also, defaulting to False means the scalability issues described in the PR (tmux session accumulation, resource leaks) will still occur by default. Users must explicitly opt in to get the fixes. Should this default to True so that the problems are actually fixed?

) -> "LocalConversation": ...

@overload
Expand All @@ -88,6 +89,7 @@ def __new__(
type[ConversationVisualizerBase] | ConversationVisualizerBase | None
) = DefaultConversationVisualizer,
secrets: dict[str, SecretValue] | dict[str, str] | None = None,
stop_agent_on_close: bool = False,
) -> "RemoteConversation": ...

def __new__(
Expand All @@ -109,6 +111,7 @@ def __new__(
type[ConversationVisualizerBase] | ConversationVisualizerBase | None
) = DefaultConversationVisualizer,
secrets: dict[str, SecretValue] | dict[str, str] | None = None,
stop_agent_on_close: bool = False,
) -> BaseConversation:
from openhands.sdk.conversation.impl.local_conversation import LocalConversation
from openhands.sdk.conversation.impl.remote_conversation import (
Expand All @@ -134,6 +137,7 @@ def __new__(
visualizer=visualizer,
workspace=workspace,
secrets=secrets,
stop_agent_on_close=stop_agent_on_close,
)

return LocalConversation(
Expand All @@ -149,4 +153,5 @@ def __new__(
workspace=workspace,
persistence_dir=persistence_dir,
secrets=secrets,
stop_agent_on_close=stop_agent_on_close,
)
32 changes: 19 additions & 13 deletions openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class LocalConversation(BaseConversation):
llm_registry: LLMRegistry
_cleanup_initiated: bool
_hook_processor: HookEventProcessor | None
stop_agent_on_close: bool = False

def __init__(
self,
Expand All @@ -77,6 +78,7 @@ def __init__(
type[ConversationVisualizerBase] | ConversationVisualizerBase | None
) = DefaultConversationVisualizer,
secrets: Mapping[str, SecretValue] | None = None,
stop_agent_on_close: bool = False,
**_: object,
):
"""Initialize the conversation.
Expand Down Expand Up @@ -222,6 +224,7 @@ def _default_callback(e):

atexit.register(self.close)
self._start_observability_span(str(desired_id))
self.stop_agent_on_close = stop_agent_on_close

@property
def id(self) -> ConversationID:
Expand Down Expand Up @@ -535,20 +538,23 @@ def close(self) -> None:
except AttributeError:
# Object may be partially constructed; span fields may be missing.
pass
try:
tools_map = self.agent.tools_map
except (AttributeError, RuntimeError):
# Agent not initialized or partially constructed
return
for tool in tools_map.values():
if self.stop_agent_on_close:
try:
executable_tool = tool.as_executable()
executable_tool.executor.close()
except NotImplementedError:
# Tool has no executor, skip it without erroring
continue
except Exception as e:
logger.warning(f"Error closing executor for tool '{tool.name}': {e}")
tools_map = self.agent.tools_map
except (AttributeError, RuntimeError):
# Agent not initialized or partially constructed
return
for tool in tools_map.values():
try:
executable_tool = tool.as_executable()
executable_tool.executor.close()
except NotImplementedError:
# Tool has no executor, skip it without erroring
continue
except Exception as e:
logger.warning(
f"Error closing executor for tool '{tool.name}': {e}"
)
Comment on lines +541 to +557
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Breaking Change: Tool executors no longer cleaned up by default

Previously, tool executors were always closed when conversations ended. Now they are only closed if stop_agent_on_close=True (which defaults to False).

This is a breaking behavioral change that will:

  1. Break existing tests (tests/tools/terminal/test_conversation_cleanup.py expects executors to be cleaned up)
  2. Lead to resource leaks (tmux sessions, open files, etc.) unless users know to set the flag
  3. Contradict the PR description which claims to fix resource accumulation

Why was this behavior changed? If there was a specific reason to make cleanup opt-in, it should be documented. Otherwise, executors should always be cleaned up to prevent resource leaks.


def ask_agent(self, question: str) -> str:
"""Ask the agent a simple, stateless question and get a direct LLM response.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ class RemoteConversation(BaseConversation):
_client: httpx.Client
_hook_processor: HookEventProcessor | None
_cleanup_initiated: bool
stop_agent_on_close: bool = False
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we consider renaming it to something like delete_on_close?


def __init__(
self,
Expand All @@ -456,6 +457,7 @@ def __init__(
type[ConversationVisualizerBase] | ConversationVisualizerBase | None
) = DefaultConversationVisualizer,
secrets: Mapping[str, SecretValue] | None = None,
stop_agent_on_close: bool = False,
**_: object,
) -> None:
"""Remote conversation proxy that talks to an agent server.
Expand Down Expand Up @@ -623,6 +625,7 @@ def __init__(
)
self._hook_processor = HookEventProcessor(hook_manager=hook_manager)
self._hook_processor.run_session_start()
self.stop_agent_on_close = stop_agent_on_close

def _create_llm_completion_log_callback(self) -> ConversationCallbackType:
"""Create a callback that writes LLM completion logs to client filesystem."""
Expand Down Expand Up @@ -992,6 +995,13 @@ def close(self) -> None:
pass

self._end_observability_span()
if self.stop_agent_on_close:
try:
# trigger server-side delete_conversation to release resources
# like tmux sessions
_send_request(self._client, "DELETE", f"/api/conversations/{self.id}")
except Exception:
pass
Comment on lines +998 to +1004
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Breaking Change: Server-side cleanup now opt-in

An earlier commit (236ed33) always called DELETE to clean up tmux sessions. This was later changed to be opt-in via the stop_agent_on_close flag, which defaults to False.

This means:

  1. By default, tmux sessions will still accumulate (the problem is NOT fixed)
  2. The example code in the PR description does not set stop_agent_on_close=True, so it would still have the accumulation problem
  3. Users must know to explicitly enable this flag

Additionally, the broad except Exception: pass on line 1003 silently swallows all errors, which could hide important issues like network failures or permission errors. Consider logging the exception:

Suggested change
if self.stop_agent_on_close:
try:
# trigger server-side delete_conversation to release resources
# like tmux sessions
_send_request(self._client, "DELETE", f"/api/conversations/{self.id}")
except Exception:
pass
if self.stop_agent_on_close:
try:
# trigger server-side delete_conversation to release resources
# like tmux sessions
_send_request(self._client, "DELETE", f"/api/conversations/{self.id}")
except Exception as e:
logger.warning(f"Failed to delete conversation on server: {e}")


def __del__(self) -> None:
try:
Expand Down
11 changes: 8 additions & 3 deletions openhands-sdk/openhands/sdk/workspace/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,17 @@ def client(self) -> httpx.Client:
if client is None:
# Configure reasonable timeouts for HTTP requests
# - connect: 10 seconds to establish connection
# - read: 60 seconds to read response (for LLM operations)
# - read: 600 seconds to read response (for LLM operations)
# - write: 10 seconds to send request
# - pool: 10 seconds to get connection from pool
timeout = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=10.0)
timeout = httpx.Timeout(
connect=10.0, read=self.read_timeout, write=10.0, pool=10.0
)
client = httpx.Client(
base_url=self.host, timeout=timeout, headers=self._headers
base_url=self.host,
timeout=timeout,
headers=self._headers,
limits=httpx.Limits(max_connections=None),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Critical: Unbounded connection limit is dangerous

Setting max_connections=None removes all connection pooling limits, which could lead to resource exhaustion. While the PR description mentions needing >100 concurrent connections, unlimited connections can cause:

  • Memory exhaustion
  • File descriptor exhaustion (even with ulimit fixes)
  • Performance degradation

Recommend setting a large but bounded limit instead:

Suggested change
limits=httpx.Limits(max_connections=None),
limits=httpx.Limits(max_connections=1000, max_keepalive_connections=500),

This allows scaling to 1000 concurrent connections (10x the default) while still providing protection against runaway resource usage.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about we keep the default limits?

)
self._client = client
return client
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ class RemoteWorkspaceMixin(BaseModel):
working_dir: str = Field(
description="The working directory for agent operations and tool execution."
)
read_timeout: float = Field(
default=600.0,
description="Timeout in seconds for reading operations of httpx.Client.",
)

def model_post_init(self, context: Any) -> None:
# Set up remote host
Expand Down
2 changes: 2 additions & 0 deletions openhands-workspace/openhands/workspace/docker/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ def _start_container(self, image: str, context: Any) -> None:
"--platform",
self.platform,
"--rm",
"--ulimit",
"nofile=65536:65536", # prevent "too many open files" errors
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider ApptainerWorkspace

This ulimit fix is only applied to DockerWorkspace. Does ApptainerWorkspace need the same fix? If users run large-scale jobs with Apptainer, they could still hit "too many open files" errors.

Check if apptainer run has similar ulimit options that should be set.

"--name",
f"agent-server-{uuid.uuid4()}",
*flags,
Expand Down
Loading