From e8e02e5ccc9227055617247fad60e1a973885109 Mon Sep 17 00:00:00 2001
From: Atream
Date: Sun, 23 Feb 2025 14:21:18 +0000
Subject: [PATCH] support Moonlight

---
 ktransformers/local_chat.py        | 10 ++--------
 ktransformers/operators/experts.py |  2 +-
 ktransformers/util/utils.py        |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 5b40455..d087752 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -58,13 +58,12 @@ def local_chat(
     gguf_path: str | None = None,
     max_new_tokens: int = 300,
     cpu_infer: int = Config().cpu_infer,
-    use_cuda_graph: bool = False,
+    use_cuda_graph: bool = True,
     prompt_file : str | None = None,
     mode: str = "normal",
     force_think: bool = False,
 ):
 
-
     torch.set_grad_enabled(False)
 
     Config().cpu_infer = cpu_infer
@@ -160,9 +159,6 @@ def local_chat(
         input_tensor = tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, return_tensors="pt"
         )
-
-        # input_tensor = torch.tensor([[0, 6657, 84646]], device=input_tensor.device)
-
         if force_think:
             token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
             input_tensor = torch.cat(
@@ -184,6 +180,4 @@ def local_chat(
 
 
 if __name__ == "__main__":
-    # fire.Fire(local_chat)
-    # local_chat(model_path="/mnt/data/model/DeepSeek-R1", gguf_path="/mnt/data/model/DeepseekV3-q4km-gguf", cpu_infer=33, force_think=False)
-    local_chat(model_path="/mnt/data/model/Moonlight-16B-A3B-Instruct", gguf_path="/mnt/data/model/Moonlight-16B-A3B-Instruct-GGUF", cpu_infer=33, force_think=False)
\ No newline at end of file
+    fire.Fire(local_chat)
\ No newline at end of file
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 04c04c5..035bac4 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -159,7 +159,7 @@ class KExpertsCPU(KExpertsBase):
         down_ptr = ctypes.addressof(
             ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
         )
-        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
+        #print(self.gate_type, self.up_type, self.down_type)
         n_routed_experts = self.n_routed_experts
         # n_routed_experts = len(self.orig_module)
         moe_config = MOEConfig(
diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py
index cc4a323..5c608b1 100644
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -207,7 +207,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
             tokens.append(int(next_token))
             seq_length += 1
 
-            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
+            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                 print(stream.end(), end="", flush=True)
                 break
             else: