Merge pull request #638 from kvcache-ai/feat-moonlight

fix KExpertsMarlin on GPU without CUDA Graph

commit 4b5991e77e
@@ -53,6 +53,17 @@
       generate_op: "KExpertsCPU"
       out_device: "cuda"
   recursive: False # don't recursively inject submodules of this module
+# if you want to use more VRAM, use the Marlin experts and disable CUDA Graph (disabling CUDA Graph may lower performance)
+#- match:
+#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+#  replace:
+#    class: ktransformers.operators.experts.KTransformersExperts # custom MoE kernel with expert parallelism
+#    kwargs:
+#      prefill_device: "cuda"
+#      prefill_op: "KExpertsTorch"
+#      generate_device: "cuda"
+#      generate_op: "KExpertsMarlin"
+#  recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
   replace:
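Each rule in this YAML matches module paths by regular expression and swaps the matched module for the listed replacement class. Note that the doubled backslashes are YAML string escaping; the effective regex is ^model\.layers\..*\.mlp\.experts$. Below is a minimal Python sketch of that matching step; the module paths are hypothetical, and the real injection logic lives in ktransformers' optimize machinery.

import re

# A rule's `match.name` is applied against each module's dotted path.
# Pattern taken from the commented-out Marlin rule above.
rule_pattern = re.compile(r"^model\.layers\..*\.mlp\.experts$")

# Hypothetical module paths, for illustration only.
candidates = [
    "model.layers.0.mlp.experts",   # matches -> would become KTransformersExperts
    "model.layers.27.mlp.experts",  # matches
    "model.layers.0.self_attn",     # not matched by this rule
]
for name in candidates:
    print(f"{name}: {'replace' if rule_pattern.match(name) else 'keep'}")

To try the higher-VRAM Marlin path, uncomment the block above and run with CUDA Graph disabled; as the comment warns, disabling CUDA Graph may lower performance.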
@@ -310,6 +310,8 @@ class GGUFLoader:
         values = GGML_DEQUANTIZE[ggml_name](data)
         values = torch.from_numpy(values.copy())
 
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
         values = values.view(shape[-2::-1])
 
         return values
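The new branch works around NumPy having no bfloat16 dtype: for BF16 tensors the dequantize step presumably hands back the raw 16-bit payload, and view(torch.bfloat16) reinterprets those bits in place before the final reshape. A minimal sketch of that reinterpretation, using hypothetical bit patterns rather than data from a real GGUF file:

import numpy as np
import torch

# Hypothetical raw BF16 payload. NumPy cannot represent bfloat16, so a
# GGUF reader ends up holding the raw 16-bit words; here they are faked
# as the int16 bit patterns of 1.0, 2.0, 1.5 and 0.0.
raw = np.array([16256, 16384, 16320, 0], dtype=np.int16)

values = torch.from_numpy(raw.copy())  # copy, as GGUFLoader does, so the tensor owns its memory
values = values.view(torch.bfloat16)   # reinterpret the bits; no numeric conversion
print(values)                          # tensor([1., 2., 1.5000, 0.], dtype=torch.bfloat16)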