From f327695079298e703e24eb8a9f2493fe4e2bde80 Mon Sep 17 00:00:00 2001
From: Atream
Date: Mon, 24 Feb 2025 09:30:54 +0000
Subject: [PATCH] fix KExpertsMarlin on GPU without CUDA Graph

---
 .../optimize/optimize_rules/Moonlight-16B-A3B.yaml | 11 +++++++++++
 ktransformers/util/custom_gguf.py                  |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
index 4c8eca2..6cea246 100644
--- a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
+++ b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
@@ -53,6 +53,17 @@
       generate_op: "KExpertsCPU"
       out_device: "cuda"
   recursive: False # don't recursively inject submodules of this module
+# if you want to use more VRAM, use the Marlin experts and disable CUDA Graph (disabling CUDA Graph may reduce performance)
+#- match:
+#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+#  replace:
+#    class: ktransformers.operators.experts.KTransformersExperts # custom MoE kernel with expert parallelism
+#    kwargs:
+#      prefill_device: "cuda"
+#      prefill_op: "KExpertsTorch"
+#      generate_device: "cuda"
+#      generate_op: "KExpertsMarlin"
+#  recursive: False # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
   replace:
diff --git a/ktransformers/util/custom_gguf.py b/ktransformers/util/custom_gguf.py
index 919f432..72c3efb 100644
--- a/ktransformers/util/custom_gguf.py
+++ b/ktransformers/util/custom_gguf.py
@@ -310,6 +310,8 @@ class GGUFLoader:
 
         values = GGML_DEQUANTIZE[ggml_name](data)
         values = torch.from_numpy(values.copy())
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
         values = values.view(shape[-2::-1])
 
         return values
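
Note on the custom_gguf.py change (illustration, not part of the patch): numpy has no bfloat16 dtype, so a GGUF tensor stored as BF16 reaches torch as raw 16-bit integers, and the added values.view(torch.bfloat16) reinterprets those bits as real bfloat16 values before the final reshape. The standalone sketch below shows that reinterpretation; the sample values and the bit twiddling used to build them are illustrative only and are not taken from ktransformers.

# --- illustrative sketch, not ktransformers code ---
import numpy as np
import torch

# Build a few bfloat16 bit patterns by truncating fp32 values (bf16 = top 16 bits of fp32).
fp32 = np.array([1.0, -2.5, 3.25], dtype=np.float32)
bits = np.frombuffer(fp32.tobytes(), dtype=np.uint32)
bf16_bits = (bits >> 16).astype(np.uint16).view(np.int16)

# numpy cannot label these as bfloat16, so the tensor arrives as plain 16-bit integers ...
t = torch.from_numpy(bf16_bits.copy())
# ... and view(torch.bfloat16) reinterprets the same bytes as bfloat16 values without copying.
values = t.view(torch.bfloat16)
print(values)  # tensor([ 1.0000, -2.5000,  3.2500], dtype=torch.bfloat16)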