From fb87d3611d50cc3cbd894b26a881a365dd2bdf57 Mon Sep 17 00:00:00 2001 From: niushengxiao Date: Tue, 23 Dec 2025 19:41:50 +0800 Subject: [PATCH 1/3] fix: coordinate autotune_warmup sequence --- docker/cuda_version_12.8.0/Dockerfile.nixl | 2 +- docker/cuda_version_12.8.0/Dockerfile.nixl.deepep | 2 +- docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache | 2 +- lightllm/common/basemodel/basemodel.py | 2 +- requirements.txt | 3 ++- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl b/docker/cuda_version_12.8.0/Dockerfile.nixl index 2febcc3f2..d26fceb68 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl @@ -81,7 +81,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ + git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \ diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep index bb58d8dc5..19633d252 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep @@ -108,7 +108,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ + git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \ diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache index e60fc24ff..9d4d6f382 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache @@ -110,7 +110,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b main && \ + git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \ diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py index 84d53f3b1..a702e660e 100755 --- a/lightllm/common/basemodel/basemodel.py +++ b/lightllm/common/basemodel/basemodel.py @@ -112,10 +112,10 @@ def __init__(self, kvargs): self._init_some_value() self._init_custom() self._init_inferstate_cls() - self._autotune_warmup() self._init_padded_req() # wait必须在init cudagraph 之前,避免错误捕获 self._wait_other_modules_ready() + self._autotune_warmup() self._init_cudagraph() self._init_prefill_cuda_graph() self._check_max_len_infer() diff --git a/requirements.txt b/requirements.txt index d3b88e2e1..8d9a011be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -92,4 +92,5 @@ torchvision==0.23.0 interegular==0.3.3 partial_json_parser==0.2.1.1.post6 websockets==15.0.1 -cupy-cuda12x==13.6.0 \ No newline at end of file +cupy-cuda12x==13.6.0 +nixl==0.8.0 From 2bd6cb12b2cbf6f34982af6ffe5aebfe38e8a984 Mon Sep 17 00:00:00 2001 From: hiworldwzj <30762946+hiworldwzj@users.noreply.github.com> Date: Wed, 24 Dec 2025 09:47:27 +0800 Subject: [PATCH 2/3] fix --- lightllm/common/basemodel/basemodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py index a702e660e..2d4209028 100755 --- a/lightllm/common/basemodel/basemodel.py +++ b/lightllm/common/basemodel/basemodel.py @@ -112,10 +112,10 @@ def __init__(self, kvargs): self._init_some_value() self._init_custom() self._init_inferstate_cls() - self._init_padded_req() # wait必须在init cudagraph 之前,避免错误捕获 self._wait_other_modules_ready() self._autotune_warmup() + self._init_padded_req() self._init_cudagraph() self._init_prefill_cuda_graph() self._check_max_len_infer() From 58f8df8b85f1bc1676a22ff3b0fa773da9b9a9a6 Mon Sep 17 00:00:00 2001 From: niushengxiao Date: Wed, 24 Dec 2025 09:54:18 +0800 Subject: [PATCH 3/3] fix --- docker/cuda_version_12.8.0/Dockerfile.nixl | 2 +- docker/cuda_version_12.8.0/Dockerfile.nixl.deepep | 2 +- docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl b/docker/cuda_version_12.8.0/Dockerfile.nixl index d26fceb68..4bcb66af5 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl @@ -81,7 +81,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ + git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \ diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep index 19633d252..96461dcc1 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep @@ -108,7 +108,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ + git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \ diff --git a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache index 9d4d6f382..2ff2dc361 100644 --- a/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache +++ b/docker/cuda_version_12.8.0/Dockerfile.nixl.deepep.cache @@ -110,7 +110,7 @@ RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool l RUN apt-get update && apt-get install -y pkg-config tmux net-tools libaio-dev ; \ cd /usr/local/src; \ pip install --upgrade meson pybind11 patchelf; \ - git clone https://github.com/ai-dynamo/nixl.git -b v0.8.0 && \ + git clone https://github.com/ai-dynamo/nixl.git -b 0.8.0 && \ cd nixl && \ rm -rf build && \ mkdir build && \