Update attention.py

commit 92399283b6 (parent d90749d35d)
Author: Atream
Date: 2025-02-15 15:43:35 +08:00 (committed by GitHub)

attention.py

@@ -262,7 +262,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
"""
# flash attn doesn't support head_dim bigger than 256
# use vLLM triton attention kernel for MQA
# use triton attention kernel adapted from vLLM and SGLang for MQA
decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
page_table,
position_ids.squeeze(0).to(torch.int32), attn_logits,
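
For context, the kernel called in this hunk performs grouped decode attention: many query heads share a single compressed KV entry per cached token (multi-query style), which is why the path avoids flash attention, whose head_dim limit the existing comment notes, in favor of a Triton kernel adapted from vLLM and SGLang. The snippet below is only a minimal PyTorch sketch of the math one such decode step computes; the function name, shapes, and arguments are illustrative and are not the signature of decode_attention_fwd_grouped.

```python
import torch

def mqa_decode_reference(q, k_cache, v_cache):
    """Naive single-token MQA/grouped decode step (illustrative sketch only).

    q:       [num_q_heads, head_dim]  query heads for the token being decoded
    k_cache: [kv_len, head_dim]       one shared KV head, as in multi-query attention
    v_cache: [kv_len, head_dim]
    returns: [num_q_heads, head_dim]
    """
    scale = q.shape[-1] ** -0.5
    # every query head attends to the same shared K/V cache entries
    scores = (q @ k_cache.T) * scale   # [num_q_heads, kv_len]
    probs = torch.softmax(scores, dim=-1)
    return probs @ v_cache             # [num_q_heads, head_dim]

# toy usage: 128 query heads sharing one KV head over a 1024-token cache
out = mqa_decode_reference(torch.randn(128, 64),
                           torch.randn(1024, 64),
                           torch.randn(1024, 64))
```

Judging from the arguments visible in the hunk, the real kernel additionally walks a paged KV cache via page_table and uses attn_logits as an intermediate buffer, fusing the whole decode step into a single Triton launch.
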
@@ -551,4 +551,4 @@ class KLlamaAttention(BaseInjectedModule):
         if not output_attentions:
             attn_weights = None
-        return attn_output, attn_weights, past_key_value
+        return attn_output, attn_weights, past_key_value