Commit 7c50810

refine cudagraph
1 parent 2650f58 commit 7c50810

2 files changed: +44 -24 lines changed


fastdeploy/input/preprocess.py

Lines changed: 6 additions & 4 deletions
@@ -72,11 +72,13 @@ def create_processor(self):
         reasoning_parser_obj = None
         tool_parser_obj = None
         try:
-            from fastdeploy.plugins.reasoning_parser import (
-                load_reasoning_parser_plugins,
-            )
+            if self.reasoning_parser == "custom_reasoning_parser":
+                from fastdeploy.plugins.reasoning_parser import (
+                    load_reasoning_parser_plugins,
+                )

-            reasoning_parser_obj = load_reasoning_parser_plugins()
+                custom_reasoning_parser = load_reasoning_parser_plugins()
+                reasoning_parser_obj = custom_reasoning_parser
         except:
             if self.reasoning_parser:
                 reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser)
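
The hunk above narrows plugin loading: the import of load_reasoning_parser_plugins is only attempted when the configured parser name is the sentinel value "custom_reasoning_parser", and any failure still falls through to the built-in lookup. Below is a standalone sketch of that control flow; apart from load_reasoning_parser_plugins, every name (resolve_reasoning_parser, builtin_registry) is a hypothetical stand-in, not FastDeploy API.

def resolve_reasoning_parser(parser_name, builtin_registry):
    parser_obj = None
    try:
        if parser_name == "custom_reasoning_parser":
            # Plugin loading is attempted only for the sentinel name.
            from fastdeploy.plugins.reasoning_parser import (
                load_reasoning_parser_plugins,
            )

            parser_obj = load_reasoning_parser_plugins()
    except Exception:
        # Plugin import/loading failed: fall back to a registered built-in parser.
        if parser_name:
            parser_obj = builtin_registry[parser_name]
    return parser_obj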

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 38 additions & 20 deletions
@@ -14,14 +14,15 @@
 # limitations under the License.
 """

+import os
 from contextlib import contextmanager
-from dataclasses import dataclass
-from typing import Callable, Dict, Optional
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List, Optional

-import paddle.jit.dy2static.utils as jit_utils
 import paddle.nn.layer
 from paddle.device.cuda import graphs

+from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication import capture_custom_allreduce
 from fastdeploy.utils import get_logger

@@ -40,36 +41,39 @@ class ConcreteSizeEntry:
     # Has runtime-bs been captured before
     captured: bool = False

-    # Need to be captured callable object(dynamic graph or static graph backend)
+    # Need to be captured callable object(dynamic graph or static grpah backend)
     runnable: Callable = None  # type: ignore
     # Number of completed warmups
     num_finished_warmup: int = 0
     # Captured cuda graph object corresponding to the current real shape
     cuda_graph: Optional[graphs.CUDAGraph] = None
-    # Output buffer of cudagraph
-    output_buffer: Optional[paddle.Tensor] = None
+    # Output buffers of cudagraph
+    output_buffers: List[Optional[paddle.Tensor]] = field(default_factory=list)


 class Dy2StCudaGraphManager:
     def __init__(self):
+        # NOTE(gongshaotian): Use local import to avoid RLHF version problems
+        from paddle.jit.dy2static.utils import CUDAGraphState

-        self.state = jit_utils.CUDAGraphState.DISABLE
+        self.state = CUDAGraphState.DISABLE
         self.captured_batch_size = set()
         self.batch_size = -1

     def run_impl(self, original_run_impl, inputs, parameters, attrs):
+        from paddle.jit.dy2static.utils import CUDAGraphState

         run_state = self.state
         prog_attrs, cuda_graph_attrs = attrs
-        if run_state == jit_utils.CUDAGraphState.REPLAY:
+        if run_state == CUDAGraphState.REPLAY:
             if self.batch_size not in self.captured_batch_size:
-                run_state = jit_utils.CUDAGraphState.DISABLE
-        elif run_state == jit_utils.CUDAGraphState.CAPTURE:
+                run_state = CUDAGraphState.DISABLE
+        elif run_state == CUDAGraphState.CAPTURE:
             self.captured_batch_size.add(self.batch_size)

         cuda_graph_attrs |= {
             "cuda_graph_state": run_state,
-            "cuda_graph_dispatch_key": self.batch_size if run_state != jit_utils.CUDAGraphState.DISABLE else 0,
+            "cuda_graph_dispatch_key": self.batch_size if run_state != CUDAGraphState.DISABLE else 0,
         }
         return original_run_impl(inputs, parameters, (prog_attrs, cuda_graph_attrs))

@@ -102,6 +106,7 @@ def __init__(
         self.cuda_graph_manager = Dy2StCudaGraphManager()

     def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
+        from paddle.jit.dy2static.utils import CUDAGraphState

         if not entry.captured:
             # Warmup the model

@@ -118,21 +123,21 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
             entry.input_addresses = input_addresses

             # Capture
-            self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE
+            self.cuda_graph_manager.state = CUDAGraphState.CAPTURE
             self.cuda_graph_manager.batch_size = entry.real_shape
             entry.captured = True
             with self.cuda_graph_manager.run_impl_guard():
                 entry.runnable(**kwargs)

         # Replay
-        self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY
+        self.cuda_graph_manager.state = CUDAGraphState.REPLAY
         self.cuda_graph_manager.batch_size = entry.real_shape
         with self.cuda_graph_manager.run_impl_guard():
             return entry.runnable(**kwargs)

     def __call__(self, **kwargs):
         # Get real shape(all num tokens)
-        ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
+        ids_remove_padding: paddle.Tensor = kwargs["forward_meta"].ids_remove_padding
         real_shape = ids_remove_padding.shape[0]
         padding_real_shape = self.real_shape_to_captured_size[real_shape]
         logger.debug(

@@ -173,14 +178,22 @@ def __call__(self, **kwargs):
             # Capture
             with capture_custom_allreduce():
                 new_grpah.capture_begin()
-                output = entry.runnable(**kwargs)
+                outputs = entry.runnable(**kwargs)
+                if isinstance(outputs, paddle.Tensor):
+                    assert outputs is not None
+                    outputs = [outputs]
                 new_grpah.capture_end()

             # Store output buffer
             entry.cuda_graph = new_grpah
-            entry.output_buffer = paddle.zeros_like(output)
-            output._share_buffer_to(entry.output_buffer)
-            output._clear
+            for output in outputs:
+                if output is not None:
+                    output_buffer = paddle.zeros_like(output)
+                    output._share_buffer_to(output_buffer)
+                    output._clear
+                    entry.output_buffers.append(output_buffer)
+                else:
+                    entry.output_buffers.append(None)

             paddle.device.synchronize()

@@ -191,7 +204,9 @@ def __call__(self, **kwargs):
             # Replay
             entry.cuda_graph.replay()
             logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
-            return entry.output_buffer
+            if len(entry.output_buffers) == 1:
+                return entry.output_buffers[0]
+            return entry.output_buffers

     def _create_entry_dict(self):
         """ """

@@ -221,8 +236,11 @@ def clear_graph(self):

     def _save_cudagrpah_dot_files(self, entry):
         """Print CUDAGrpah to dot files"""
+        log_dir = envs.FD_LOG_DIR
+        if not os.path.exists(log_dir):
+            os.makedirs(log_dir, exist_ok=True)
         if entry.cuda_graph:
             entry.cuda_graph.print_to_dot_files(
-                f"./log/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
+                f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
                 1 << 0,
             )
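
The backend above captures entry.runnable once per padded shape and then returns stable output buffers on every replay; this commit generalizes the single output_buffer to a list of output_buffers. Below is a minimal sketch of that capture/replay pattern using the same Paddle calls seen in the hunk (graphs.CUDAGraph, _share_buffer_to, paddle.device.synchronize); it assumes a GPU build of Paddle, and run_model is a hypothetical stand-in for entry.runnable, not FastDeploy API.

import paddle
from paddle.device.cuda import graphs


def capture_once(run_model, **kwargs):
    # Record the kernels launched by run_model into a CUDA graph.
    graph = graphs.CUDAGraph()
    graph.capture_begin()
    outputs = run_model(**kwargs)
    if isinstance(outputs, paddle.Tensor):
        outputs = [outputs]  # normalize a single-tensor return, as the hunk does
    graph.capture_end()

    # Keep buffers that alias the memory written during capture, so each
    # later replay() fills exactly these tensors.
    buffers = []
    for out in outputs:
        if out is None:
            buffers.append(None)
            continue
        buf = paddle.zeros_like(out)
        out._share_buffer_to(buf)  # private Tensor API, used the same way above
        buffers.append(buf)
    paddle.device.synchronize()
    return graph, buffers


def replay(graph, buffers):
    # Re-launch the captured kernels; results appear in the shared buffers.
    graph.replay()
    return buffers[0] if len(buffers) == 1 else buffers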
