
Commit b46d843

fix deepseek oom
1 parent: 384f36c

File tree

2 files changed (+7, -3 lines)


vllm_ascend/attention/attention_v1.py

Lines changed: 2 additions & 1 deletion
@@ -213,7 +213,8 @@ def __init__(
             self.model_config.max_model_len,
             AscendAttentionBackend.get_supported_block_size()[0])

-    def reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
+    def reorder_batch(self, input_batch,
+                      scheduler_output: "SchedulerOutput") -> bool:
         return False

     def build(
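
The new input_batch parameter keeps the Ascend builder in step with the vLLM v1 interface, where the model runner passes its input batch to reorder_batch so a backend can permute requests in place. A minimal sketch of the calling side this implies (the wrapper below is illustrative, not the actual vLLM runner code):

    def maybe_reorder(builder, input_batch, scheduler_output) -> bool:
        # The builder may reorder requests in input_batch (e.g. grouping
        # decodes before prefills) and returns True if it did, so the runner
        # can refresh its index mappings. The Ascend backend never reorders,
        # hence its unconditional False.
        return builder.reorder_batch(input_batch, scheduler_output)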

vllm_ascend/worker/model_runner_v1.py

Lines changed: 5 additions & 2 deletions
@@ -2458,7 +2458,8 @@ def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
                     num_blocks, kv_cache_spec.block_size,
                     kv_cache_spec.num_kv_heads,
                     kv_cache_spec.head_size)
-                elif hasattr(attn_backend, "get_supported_block_size"):
+                elif hasattr(attn_backend, "get_supported_block_size"
+                             ) and not self.model_config.is_deepseek_mla:
                     block_size = attn_backend.get_supported_block_size()[0]
                     block_size_chunk = kv_cache_spec.block_size // block_size
                     kv_cache_shape = attn_backend.get_kv_cache_shape(
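
The guarded branch splits each cache-config block into kernel-sized chunks before reshaping the KV cache; DeepSeek MLA models now skip it, which is presumably where the reported OOM came from, since MLA's compressed latent KV cache has a different shape than the per-head cache this reshaping assumes. A worked example of the chunking arithmetic, with illustrative values (the real ones come from kv_cache_spec and the backend):

    cache_block_size = 128                     # kv_cache_spec.block_size
    kernel_block_size = 64                     # get_supported_block_size()[0]
    block_size_chunk = cache_block_size // kernel_block_size   # -> 2
    num_blocks = 1024
    # The cache is then laid out as num_blocks * block_size_chunk kernel
    # blocks of 64 tokens each, i.e. 2048 blocks instead of 1024.
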
@@ -2605,7 +2606,9 @@ def may_reinitialize_input_batch(self,
                         [self.cache_config.block_size])
                 else:
                     # Fallback to cache config block_size if no backend found
-                    kernel_block_size_list = [64]
+                    kernel_block_size_list = [
+                        64
+                    ] if not self.model_config.is_deepseek_mla else [0]
                 kernel_block_sizes.append(kernel_block_size_list)
             else:
                 # This is likely Mamba or other non-attention cache,
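
The wrapped three-line conditional expression is dense; an equivalent flattened sketch of the fallback it encodes (is_deepseek_mla stands in for self.model_config.is_deepseek_mla, and reading 0 as "no kernel-imposed block size" is an inference from context, not stated in the diff):

    def fallback_kernel_block_sizes(is_deepseek_mla: bool) -> list[int]:
        if is_deepseek_mla:
            return [0]   # assumed sentinel: keep the cache-config block size
        return [64]      # default Ascend kernel block size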
