
Commit b46d843

fix deepseek oom
1 parent: 384f36c

File tree

2 files changed (+7, -3 lines)


vllm_ascend/attention/attention_v1.py

Lines changed: 2 additions & 1 deletion
@@ -213,7 +213,8 @@ def __init__(
             self.model_config.max_model_len,
             AscendAttentionBackend.get_supported_block_size()[0])

-    def reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
+    def reorder_batch(self, input_batch,
+                      scheduler_output: "SchedulerOutput") -> bool:
         return False

     def build(
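
The new input_batch parameter keeps the Ascend builder in step with the vLLM v1 interface, where the model runner passes its input batch to reorder_batch so a backend can permute requests in place. A minimal sketch of the calling side this implies (the wrapper below is illustrative, not the actual vLLM runner code):

    def maybe_reorder(builder, input_batch, scheduler_output) -> bool:
        # The builder may reorder requests in input_batch (e.g. grouping
        # decodes before prefills) and returns True if it did, so the runner
        # can refresh its index mappings. The Ascend backend never reorders,
        # hence its unconditional False.
        return builder.reorder_batch(input_batch, scheduler_output)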

vllm_ascend/worker/model_runner_v1.py

Lines changed: 5 additions & 2 deletions
@@ -2458,7 +2458,8 @@ def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
                     num_blocks, kv_cache_spec.block_size,
                     kv_cache_spec.num_kv_heads,
                     kv_cache_spec.head_size)
-                elif hasattr(attn_backend, "get_supported_block_size"):
+                elif hasattr(attn_backend, "get_supported_block_size"
+                             ) and not self.model_config.is_deepseek_mla:
                     block_size = attn_backend.get_supported_block_size()[0]
                     block_size_chunk = kv_cache_spec.block_size // block_size
                     kv_cache_shape = attn_backend.get_kv_cache_shape(
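
The guarded branch splits each cache-config block into kernel-sized chunks before reshaping the KV cache; DeepSeek MLA models now skip it, which is presumably where the reported OOM came from, since MLA's compressed latent KV cache has a different shape than the per-head cache this reshaping assumes. A worked example of the chunking arithmetic, with illustrative values (the real ones come from kv_cache_spec and the backend):

    cache_block_size = 128                     # kv_cache_spec.block_size
    kernel_block_size = 64                     # get_supported_block_size()[0]
    block_size_chunk = cache_block_size // kernel_block_size   # -> 2
    num_blocks = 1024
    # The cache is then laid out as num_blocks * block_size_chunk kernel
    # blocks of 64 tokens each, i.e. 2048 blocks instead of 1024.
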
@@ -2605,7 +2606,9 @@ def may_reinitialize_input_batch(self,
                         [self.cache_config.block_size])
                 else:
                     # Fallback to cache config block_size if no backend found
-                    kernel_block_size_list = [64]
+                    kernel_block_size_list = [
+                        64
+                    ] if not self.model_config.is_deepseek_mla else [0]
                 kernel_block_sizes.append(kernel_block_size_list)
             else:
                 # This is likely Mamba or other non-attention cache,
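
The wrapped three-line conditional expression is dense; an equivalent flattened sketch of the fallback it encodes (is_deepseek_mla stands in for self.model_config.is_deepseek_mla, and reading 0 as "no kernel-imposed block size" is an inference from context, not stated in the diff):

    def fallback_kernel_block_sizes(is_deepseek_mla: bool) -> list[int]:
        if is_deepseek_mla:
            return [0]   # assumed sentinel: keep the cache-config block size
        return [64]      # default Ascend kernel block size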
