Mirror of https://github.com/RYDE-WORK/ktransformers.git (synced 2026-01-26 00:30:17 +08:00)
Update attention.py
commit 92399283b6 (parent d90749d35d)
@@ -262,7 +262,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         """
 
         # flash attn doesn't support head_dim bigger than 256
-        # use vLLM triton attention kernel for MQA
+        # use triton attention kernel adapted from vLLM and SGLang for MQA
         decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
                                      page_table,
                                      position_ids.squeeze(0).to(torch.int32), attn_logits,
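
For context (not part of the diff): FlashAttention kernels reject head_dim > 256, and DeepSeek-V2's absorbed MLA decode runs at head_dim 576 (kv_lora_rank 512 + qk_rope_head_dim 64), so the decode path calls a Triton grouped-attention kernel instead. The snippet below is a plain-PyTorch sketch of the semantics such a grouped MQA/GQA decode kernel computes; the function name and tensor shapes are illustrative assumptions, and it ignores the paged KV cache (page_table) and per-sequence lengths that the real decode_attention_fwd_grouped handles.

import torch

def grouped_decode_attention_ref(q, k_cache, v_cache, scale):
    # One decode step of grouped (MQA/GQA) attention; dense reference only.
    #   q:       [batch, num_q_heads, head_dim]      (a single new token)
    #   k_cache: [batch, seq_len, num_kv_heads, head_dim]
    #   v_cache: [batch, seq_len, num_kv_heads, v_head_dim]
    b, hq, d = q.shape
    _, s, hkv, _ = k_cache.shape
    group = hq // hkv  # query heads sharing each kv head (MQA: hkv == 1)
    k = k_cache.repeat_interleave(group, dim=2)    # [b, s, hq, head_dim]
    v = v_cache.repeat_interleave(group, dim=2)    # [b, s, hq, v_head_dim]
    scores = torch.einsum("bhd,bshd->bhs", q, k) * scale
    probs = scores.softmax(dim=-1)                 # softmax over seq_len
    return torch.einsum("bhs,bshd->bhd", probs, v) # [b, hq, v_head_dim]
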
@@ -551,4 +551,4 @@ class KLlamaAttention(BaseInjectedModule):
         if not output_attentions:
             attn_weights = None
 
-        return attn_output, attn_weights, past_key_value
+        return attn_output, attn_weights, past_key_value
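
Note on the second hunk: the removed and added lines are byte-identical, so only invisible characters changed here (typically trailing whitespace or the end-of-file newline; the extracted diff does not show which). The line itself follows the HF-transformers attention forward contract; a minimal sketch of that convention, with placeholder names and an assumed signature, is:

from typing import Optional, Tuple

import torch

def finish_forward(attn_output: torch.Tensor,
                   attn_weights: Optional[torch.Tensor],
                   past_key_value,
                   output_attentions: bool = False,
                   ) -> Tuple[torch.Tensor, Optional[torch.Tensor], object]:
    # HF-style attention modules return (output, weights, cache);
    # the weights slot is None unless the caller requested attentions.
    if not output_attentions:
        attn_weights = None
    return attn_output, attn_weights, past_key_value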