From e8e02e5ccc9227055617247fad60e1a973885109 Mon Sep 17 00:00:00 2001
From: Atream
Date: Sun, 23 Feb 2025 14:21:18 +0000
Subject: [PATCH] support Moonlight

---
 ktransformers/local_chat.py        | 10 ++--------
 ktransformers/operators/experts.py |  2 +-
 ktransformers/util/utils.py        |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 5b40455..d087752 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -58,13 +58,12 @@ def local_chat(
     gguf_path: str | None = None,
     max_new_tokens: int = 300,
     cpu_infer: int = Config().cpu_infer,
-    use_cuda_graph: bool = False,
+    use_cuda_graph: bool = True,
     prompt_file : str | None = None,
     mode: str = "normal",
     force_think: bool = False,
 ):
 
-
     torch.set_grad_enabled(False)
 
     Config().cpu_infer = cpu_infer
@@ -160,9 +159,6 @@ def local_chat(
         input_tensor = tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, return_tensors="pt"
         )
-
-        # input_tensor = torch.tensor([[0, 6657, 84646]], device=input_tensor.device)
-
         if force_think:
             token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
             input_tensor = torch.cat(
@@ -184,6 +180,4 @@ def local_chat(
 
 
 if __name__ == "__main__":
-    # fire.Fire(local_chat)
-    # local_chat(model_path="/mnt/data/model/DeepSeek-R1", gguf_path="/mnt/data/model/DeepseekV3-q4km-gguf", cpu_infer=33, force_think=False)
-    local_chat(model_path="/mnt/data/model/Moonlight-16B-A3B-Instruct", gguf_path="/mnt/data/model/Moonlight-16B-A3B-Instruct-GGUF", cpu_infer=33, force_think=False)
\ No newline at end of file
+    fire.Fire(local_chat)
\ No newline at end of file
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 04c04c5..035bac4 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -159,7 +159,7 @@ class KExpertsCPU(KExpertsBase):
         down_ptr = ctypes.addressof(
             ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
         )
-        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
+        #print(self.gate_type, self.up_type, self.down_type)
         n_routed_experts = self.n_routed_experts
         # n_routed_experts = len(self.orig_module)
         moe_config = MOEConfig(
diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py
index cc4a323..5c608b1 100644
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -207,7 +207,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
             tokens.append(int(next_token))
             seq_length += 1
 
-            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
+            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                 print(stream.end(), end="", flush=True)
                 break
             else: