diff --git a/nemo_deploy/llm/megatronllm_deployable.py b/nemo_deploy/llm/megatronllm_deployable.py index 8cd59d50e1..d4359d1e26 100755 --- a/nemo_deploy/llm/megatronllm_deployable.py +++ b/nemo_deploy/llm/megatronllm_deployable.py @@ -22,7 +22,7 @@ import torch.distributed from jinja2 import Template from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.inference_request import DynamicInferenceRequestRecord, InferenceRequest from nemo_deploy import ITritonDeployable from nemo_deploy.llm.inference.inference_base import create_mcore_engine @@ -470,6 +470,10 @@ def _infer_fn( ) results = self.generate(prompts, inference_params) + # Handle DynamicInferenceRequestRecord objects by merging them into a single request + results = [ + r.merge(self.mcore_tokenizer) if isinstance(r, DynamicInferenceRequestRecord) else r for r in results + ] if echo: output_texts = [r.prompt + r.generated_text if text_only else r for r in results] else: