Merge branch 'main' into feat-absorb-for-long-prefill
Commit e9b1216a9a
@@ -64,7 +64,6 @@ def local_chat(
     force_think: bool = False,
 ):

     torch.set_grad_enabled(False)

     Config().cpu_infer = cpu_infer
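Reviewer note: for context, a minimal standalone sketch of what the torch.set_grad_enabled(False) call above buys during inference (toy layer and input, not the real model):

import torch

# Globally disable autograd, as local_chat does before loading the model.
torch.set_grad_enabled(False)

layer = torch.nn.Linear(4, 4)   # toy stand-in for the real model
y = layer(torch.randn(1, 4))

print(y.requires_grad)  # False -- no autograd graph is recorded
print(y.grad_fn)        # None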
@@ -441,10 +441,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
         # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
         # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = torch.matmul(attn_output, out_absorb.mT)
+        attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
+        attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]

-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
         attn_output = self.o_proj(attn_output)

         return attn_output, None, past_key_value
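Reviewer note: the inline shape comments added above are easiest to follow with a standalone sketch of the absorbed output path. The sizes below are made up for illustration, and the transpose back before the reshape is implied by the comments rather than shown in this hunk:

import torch

# Hypothetical sizes for illustration only.
bsz, q_len, num_heads, kv_lora_rank, v_head_dim = 2, 4, 8, 16, 32

# attn_output as produced by the MLA wrapper: [bsz, q_len, num_heads, kv_lora_rank]
attn_output = torch.randn(bsz, q_len, num_heads, kv_lora_rank)
# out_absorb: [num_heads, v_head_dim, kv_lora_rank]
out_absorb = torch.randn(num_heads, v_head_dim, kv_lora_rank)

# Move the head dim in front of q_len so the per-head matmul broadcasts over heads.
attn_output = attn_output.transpose(1, 2)                # [bsz, num_heads, q_len, kv_lora_rank]
# Multiply by the transposed absorb matrix: [..., kv_lora_rank] x [kv_lora_rank, v_head_dim].
attn_output = torch.matmul(attn_output, out_absorb.mT)   # [bsz, num_heads, q_len, v_head_dim]
# Back to [bsz, q_len, num_heads, v_head_dim], then flatten the head dims for o_proj.
attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, num_heads * v_head_dim)
print(attn_output.shape)  # torch.Size([2, 4, 256])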
@@ -159,7 +159,7 @@ class KExpertsCPU(KExpertsBase):
         down_ptr = ctypes.addressof(
             ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
         )
-        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
+        #print(self.gate_type, self.up_type, self.down_type)
         n_routed_experts = self.n_routed_experts
         # n_routed_experts = len(self.orig_module)
         moe_config = MOEConfig(
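Reviewer note: the ctypes expression around self.down.ctypes.data simply recovers the raw buffer address to hand to the C++ MoE kernel; a standalone sketch with a toy numpy buffer (not the real quantized expert weights):

import ctypes
import numpy as np

# Illustrative buffer standing in for a quantized expert weight tensor.
weights = np.arange(16, dtype=np.uint8)

# .ctypes.data is already an integer address; casting it to a pointer and
# taking addressof(...contents) yields the same integer, which can then be
# passed to a C/C++ extension expecting a raw pointer.
addr_direct = weights.ctypes.data
addr_via_cast = ctypes.addressof(
    ctypes.cast(weights.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
)
assert addr_direct == addr_via_cast
print(hex(addr_via_cast))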
@@ -450,9 +450,9 @@ class KExpertsTorch(KExpertsBase):
             self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
             self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)

-        self.up = torch.cat(self.gate, dim=0)
+        self.up = torch.cat(self.up, dim=0)
         self.gate = torch.cat(self.gate, dim=0)
-        self.down = torch.cat(self.gate, dim=0)
+        self.down = torch.cat(self.down, dim=0)
         return

     def unload(self):
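Reviewer note: the two corrected torch.cat calls are the point of this hunk; before the fix, up and down were both rebuilt from the gate list. A toy sketch of the corrected behaviour (shapes are illustrative):

import torch

# Toy stand-ins for per-expert weight lists.
n_experts, rows, cols = 4, 2, 3
gate = [torch.full((rows, cols), float(i)) for i in range(n_experts)]
up = [torch.full((rows, cols), 10.0 + i) for i in range(n_experts)]
down = [torch.full((cols, rows), 20.0 + i) for i in range(n_experts)]

# Corrected logic: each projection is concatenated from its own list.
up_cat = torch.cat(up, dim=0)      # [n_experts * rows, cols]
gate_cat = torch.cat(gate, dim=0)  # [n_experts * rows, cols]
down_cat = torch.cat(down, dim=0)  # [n_experts * cols, rows]

# The pre-fix version concatenated `gate` for all three, silently replacing
# the up/down projections with copies of the gate weights.
assert not torch.equal(up_cat, gate_cat)
print(up_cat.shape, gate_cat.shape, down_cat.shape)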
ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml (new file, 86 lines)
@@ -0,0 +1,86 @@
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with a custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module

# To use more VRAM, switch the experts to Marlin and disable CUDA Graph
# (disabling CUDA Graph may reduce performance):
#- match:
#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
#  replace:
#    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
#    kwargs:
#      prefill_device: "cuda"
#      prefill_op: "KExpertsTorch"
#      generate_device: "cuda"
#      generate_op: "KExpertsMarlin"
#  recursive: False  # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
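Reviewer note: for anyone reading the new rule file cold, here is a simplified sketch of how a name-regex rule like "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" selects modules for replacement. It illustrates the matching idea only and is not the actual ktransformers injection code; the toy model and rule dict are made up:

import re
import torch.nn as nn

# Hypothetical rule mirroring the YAML entries above: match by name regex
# and class, then note which replacement class would be injected.
rule = {
    "name": r"^model\.layers\.(?!.*self_attn\.kv_b_proj).*$",
    "class": nn.Linear,
    "replace": "ktransformers.operators.linear.KTransformersLinear",
}

# Toy module tree standing in for the real model.
model = nn.ModuleDict({
    "layers": nn.ModuleList([
        nn.ModuleDict({
            "mlp": nn.Linear(8, 8),
            "self_attn": nn.ModuleDict({"kv_b_proj": nn.Linear(8, 8)}),
        })
    ])
})

for name, module in model.named_modules():
    full_name = "model." + name if name else "model"
    if re.match(rule["name"], full_name) and isinstance(module, rule["class"]):
        # Only model.layers.0.mlp matches; kv_b_proj is excluded by the lookahead.
        print(f"{full_name}: would be replaced with {rule['replace']}")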
@@ -310,6 +310,8 @@ class GGUFLoader:
         values = GGML_DEQUANTIZE[ggml_name](data)
         values = torch.from_numpy(values.copy())

+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
         values = values.view(shape[-2::-1])

         return values
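Reviewer note: the added BF16 branch reinterprets the raw 16-bit payload rather than converting it numerically. A standalone sketch with hand-built bit patterns (in the real loader these bytes come from the GGUF tensor data):

import numpy as np
import torch

# bfloat16 keeps the top 16 bits of float32, so 1.0 (0x3F800000) becomes 0x3F80.
raw = np.array([0x3F80, 0x4000, 0x4040], dtype=np.int16)  # 1.0, 2.0, 3.0 as bf16 bits

values = torch.from_numpy(raw.copy())    # 16-bit bit patterns, not numbers yet
values = values.view(torch.bfloat16)     # reinterpret the same bytes as bfloat16
print(values)  # tensor([1., 2., 3.], dtype=torch.bfloat16)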
@@ -207,7 +207,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
         tokens.append(int(next_token))
         seq_length += 1

-        if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
+        if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
             print(stream.end(), end="", flush=True)
             break
         else:
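Reviewer note: the only change here is decoding next_token.tolist() instead of the tensor itself; a tiny sketch of what that conversion produces (the token id is made up):

import torch

# Hypothetical single sampled token id, shaped like next_token in the loop above.
next_token = torch.tensor([151645])

print(type(next_token))      # <class 'torch.Tensor'>
print(next_token.tolist())   # [151645] -- a plain Python list of ints
print(next_token[0].item())  # 151645   -- scalar used for the eos_token_id check

# tokenizer.decode(next_token.tolist()) hands the tokenizer a list of ints,
# which is the most broadly accepted input type, whereas passing the tensor
# directly can trip up decode paths that expect plain integers.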