From 8704c091923288d4ff37c37101ce617129df0d98 Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:01:33 +0800
Subject: [PATCH 1/7] Allow temperature and top_p from requests

---
 .../server/api/openai/legacy/completions.py        |  4 ++--
 .../server/backend/interfaces/ktransformers.py     |  6 +++---
 .../server/backend/interfaces/transformers.py      | 18 +++++++++++-------
 .../server/schemas/legacy/completions.py           |  2 ++
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/ktransformers/server/api/openai/legacy/completions.py b/ktransformers/server/api/openai/legacy/completions.py
index be85a29..fe250f4 100644
--- a/ktransformers/server/api/openai/legacy/completions.py
+++ b/ktransformers/server/api/openai/legacy/completions.py
@@ -20,7 +20,7 @@ async def create_completion(request:Request,create:CompletionCreate):
 
     if create.stream:
         async def inner():
-            async for token in interface.inference(create.prompt,id):
+            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
                 d = {'choices':[{'delta':{'content':token}}]}
                 yield f"data:{json.dumps(d)}\n\n"
             d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
@@ -28,6 +28,6 @@ async def create_completion(request:Request,create:CompletionCreate):
         return stream_response(request,inner())
     else:
         comp = CompletionObject(id=id,object='text_completion',created=int(time()))
-        async for token in interface.inference(create.prompt,id):
+        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
             comp.append_token(token)
         return comp
diff --git a/ktransformers/server/backend/interfaces/ktransformers.py b/ktransformers/server/backend/interfaces/ktransformers.py
index 49a3f16..85bfb29 100644
--- a/ktransformers/server/backend/interfaces/ktransformers.py
+++ b/ktransformers/server/backend/interfaces/ktransformers.py
@@ -14,7 +14,7 @@ from ktransformers.models.custom_cache import StaticCache
 from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
 from ktransformers.local_chat import custom_models, default_optimize_rules
 from ktransformers.util.utils import get_device
-
+from typing import Optional
 
 warm_uped = False
 
@@ -207,7 +207,7 @@ class KTransformersInterface(TransformersInterface):
         device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
         return torch.tensor([self.seq_length - 1], device=device)
 
-    async def inference(self, local_messages, thread_id: str):
+    async def inference(self, local_messages, thread_id: str, temperature: Optional[float], top_p: Optional[float]):
         async with self._infer_lock:
-            async for v in super().inference(local_messages, thread_id):
+            async for v in super().inference(local_messages, thread_id, temperature, top_p):
                 yield v
diff --git a/ktransformers/server/backend/interfaces/transformers.py b/ktransformers/server/backend/interfaces/transformers.py
index 8211933..d2e48a4 100644
--- a/ktransformers/server/backend/interfaces/transformers.py
+++ b/ktransformers/server/backend/interfaces/transformers.py
@@ -202,13 +202,17 @@ class TransformersInterface(BackendInterfaceBase):
         self.seq_length += 1
         return self.streamer.put(new_tokens)
 
-    def prepare_logits_wrapper(self, inputs, device):
+    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
+        if temperature is None:
+            temperature = self.args.temperature
+        if top_p is None:
+            top_p = self.args.top_p
         generation_config, model_kwargs = self.model._prepare_generation_config(
             None, max_length=self.args.max_new_tokens,
             do_sample=True,
             top_k=self.args.top_k,
-            top_p=self.args.top_p,
-            temperature=self.args.temperature,
+            top_p=top_p,
+            temperature=temperature,
             repetition_penalty=self.args.repetition_penalty # change this to modify generate config
         )
         self.inputs = inputs
@@ -255,7 +259,7 @@ class TransformersInterface(BackendInterfaceBase):
         return self.logits_to_token(logits)
 
     @torch.no_grad
-    def prefill(self, input_ids: torch.Tensor, is_new: bool):
+    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None):
         input_ids_length = input_ids.shape[-1]
         logger.debug(f"input_ids: {input_ids.shape}")
 
@@ -323,7 +327,7 @@ class TransformersInterface(BackendInterfaceBase):
         else:
             logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
 
-        self.prepare_logits_wrapper(input_ids, device)
+        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
         next_token = self.logits_to_token(logits[0, -1, :])
         yield self.append_new_tokens(next_token)
 
@@ -359,7 +363,7 @@ class TransformersInterface(BackendInterfaceBase):
             self.last_request_id = thread_id
             return True
 
-    async def inference(self, local_messages, thread_id: str):
+    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None):
         self.streamer.reset()
         self.profiler.create_and_start_timer("tokenize")
         if isinstance(local_messages, List):
@@ -386,7 +390,7 @@ class TransformersInterface(BackendInterfaceBase):
             print(think, end="",flush=True)
             yield think
 
-        for t in self.prefill(input_ids, self.check_is_new(thread_id)):
+        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p):
             # output think token after prefill done
             if t is not None:
                 print(t, end="",flush=True)
diff --git a/ktransformers/server/schemas/legacy/completions.py b/ktransformers/server/schemas/legacy/completions.py
index 874e556..7be0404 100644
--- a/ktransformers/server/schemas/legacy/completions.py
+++ b/ktransformers/server/schemas/legacy/completions.py
@@ -9,6 +9,8 @@ class CompletionCreate(BaseModel):
     model: str
     prompt: str | List[str]
     stream: bool = False
+    temperature: Optional[float]
+    top_p: Optional[float]
 
     def get_tokenizer_messages(self):
         if isinstance(self.prompt,List):

From bf36547f98bdf6b02264425c814f999219ab742f Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:07:35 +0800
Subject: [PATCH 2/7] Also allow repetition_penalty

---
 .../server/api/openai/legacy/completions.py        |  4 ++--
 .../server/backend/interfaces/transformers.py      | 14 ++++++++------
 ktransformers/server/schemas/legacy/completions.py |  1 +
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/ktransformers/server/api/openai/legacy/completions.py b/ktransformers/server/api/openai/legacy/completions.py
index fe250f4..9808c3a 100644
--- a/ktransformers/server/api/openai/legacy/completions.py
+++ b/ktransformers/server/api/openai/legacy/completions.py
@@ -20,7 +20,7 @@ async def create_completion(request:Request,create:CompletionCreate):
 
     if create.stream:
         async def inner():
-            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
+            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p,create.repetition_penalty):
                 d = {'choices':[{'delta':{'content':token}}]}
                 yield f"data:{json.dumps(d)}\n\n"
             d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
@@ -28,6 +28,6 @@ async def create_completion(request:Request,create:CompletionCreate):
         return stream_response(request,inner())
     else:
         comp = CompletionObject(id=id,object='text_completion',created=int(time()))
-        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
+        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p,create.repetition_penalty):
             comp.append_token(token)
         return comp
diff --git a/ktransformers/server/backend/interfaces/transformers.py b/ktransformers/server/backend/interfaces/transformers.py
index d2e48a4..2674dd1 100644
--- a/ktransformers/server/backend/interfaces/transformers.py
+++ b/ktransformers/server/backend/interfaces/transformers.py
@@ -202,18 +202,20 @@ class TransformersInterface(BackendInterfaceBase):
         self.seq_length += 1
         return self.streamer.put(new_tokens)
 
-    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
+    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
         if temperature is None:
             temperature = self.args.temperature
         if top_p is None:
             top_p = self.args.top_p
+        if repetition_penalty is None:
+            repetition_penalty = self.args.repetition_penalty
         generation_config, model_kwargs = self.model._prepare_generation_config(
             None, max_length=self.args.max_new_tokens,
             do_sample=True,
             top_k=self.args.top_k,
             top_p=top_p,
             temperature=temperature,
-            repetition_penalty=self.args.repetition_penalty # change this to modify generate config
+            repetition_penalty=repetition_penalty # change this to modify generate config
         )
         self.inputs = inputs
         self.generation_config = generation_config
@@ -259,7 +261,7 @@ class TransformersInterface(BackendInterfaceBase):
         return self.logits_to_token(logits)
 
     @torch.no_grad
-    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None):
+    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
         input_ids_length = input_ids.shape[-1]
         logger.debug(f"input_ids: {input_ids.shape}")
 
@@ -327,7 +329,7 @@ class TransformersInterface(BackendInterfaceBase):
         else:
             logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
 
-        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
+        self.prepare_logits_wrapper(input_ids, device, temperature, top_p, repetition_penalty)
         next_token = self.logits_to_token(logits[0, -1, :])
         yield self.append_new_tokens(next_token)
 
@@ -363,7 +365,7 @@ class TransformersInterface(BackendInterfaceBase):
             self.last_request_id = thread_id
             return True
 
-    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None):
+    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
         self.streamer.reset()
         self.profiler.create_and_start_timer("tokenize")
         if isinstance(local_messages, List):
@@ -390,7 +392,7 @@ class TransformersInterface(BackendInterfaceBase):
             print(think, end="",flush=True)
             yield think
 
-        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p):
+        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p, repetition_penalty):
             # output think token after prefill done
             if t is not None:
                 print(t, end="",flush=True)
diff --git a/ktransformers/server/schemas/legacy/completions.py b/ktransformers/server/schemas/legacy/completions.py
index 7be0404..c5876d4 100644
--- a/ktransformers/server/schemas/legacy/completions.py
+++ b/ktransformers/server/schemas/legacy/completions.py
@@ -11,6 +11,7 @@ class CompletionCreate(BaseModel):
     stream: bool = False
     temperature: Optional[float]
     top_p: Optional[float]
+    repetition_penalty: Optional[float]
 
     def get_tokenizer_messages(self):
         if isinstance(self.prompt,List):

From 05ad2884535c1819603131badd212125d660a0ef Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:08:36 +0800
Subject: [PATCH 3/7] Also /chat/completions

---
 ktransformers/server/api/openai/endpoints/chat.py | 4 ++--
 ktransformers/server/schemas/endpoints/chat.py    | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/ktransformers/server/api/openai/endpoints/chat.py b/ktransformers/server/api/openai/endpoints/chat.py
index f84538a..356637c 100644
--- a/ktransformers/server/api/openai/endpoints/chat.py
+++ b/ktransformers/server/api/openai/endpoints/chat.py
@@ -28,13 +28,13 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):
     if create.stream:
         async def inner():
             chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))
-            async for token in interface.inference(input_message,id):
+            async for token in interface.inference(input_message,id,create.temperature,create.top_p,create.repetition_penalty):
                 chunk.set_token(token)
                 yield chunk
         return chat_stream_response(request,inner())
     else:
         comp = ChatCompletionObject(id=id,object='chat.completion',created=int(time()))
         comp.usage = Usage(completion_tokens=1, prompt_tokens=1, total_tokens=2)
-        async for token in interface.inference(input_message,id):
+        async for token in interface.inference(input_message,id,create.temperature,create.top_p,create.repetition_penalty):
             comp.append_token(token)
         return comp
diff --git a/ktransformers/server/schemas/endpoints/chat.py b/ktransformers/server/schemas/endpoints/chat.py
index 5c4dc4e..b929c4b 100644
--- a/ktransformers/server/schemas/endpoints/chat.py
+++ b/ktransformers/server/schemas/endpoints/chat.py
@@ -25,6 +25,9 @@ class ChatCompletionCreate(BaseModel):
     messages: List[Message]
     model : str
     stream : bool = False
+    temperature: Optional[float]
+    top_p: Optional[float]
+    repetition_penalty: Optional[float]
 
     def get_tokenizer_messages(self):
         return [m.to_tokenizer_message() for m in self.messages]

From 76487c4dcb09a2bfac93ac437ec0956325a50d07 Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:30:03 +0800
Subject: [PATCH 4/7] Revert repetition_penalty as it is not in API spec

---
 ktransformers/server/api/openai/endpoints/chat.py  |  4 ++--
 .../server/api/openai/legacy/completions.py        |  4 ++--
 .../server/backend/interfaces/transformers.py      | 14 ++++++--------
 ktransformers/server/schemas/endpoints/chat.py     |  2 +-
 ktransformers/server/schemas/legacy/completions.py |  2 +-
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/ktransformers/server/api/openai/endpoints/chat.py b/ktransformers/server/api/openai/endpoints/chat.py
index 356637c..e5ea636 100644
--- a/ktransformers/server/api/openai/endpoints/chat.py
+++ b/ktransformers/server/api/openai/endpoints/chat.py
@@ -28,13 +28,13 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):
     if create.stream:
         async def inner():
             chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))
-            async for token in interface.inference(input_message,id,create.temperature,create.top_p,create.repetition_penalty):
+            async for token in interface.inference(input_message,id,create.temperature,create.top_p):
                 chunk.set_token(token)
                 yield chunk
         return chat_stream_response(request,inner())
     else:
         comp = ChatCompletionObject(id=id,object='chat.completion',created=int(time()))
         comp.usage = Usage(completion_tokens=1, prompt_tokens=1, total_tokens=2)
-        async for token in interface.inference(input_message,id,create.temperature,create.top_p,create.repetition_penalty):
+        async for token in interface.inference(input_message,id,create.temperature,create.top_p):
             comp.append_token(token)
         return comp
diff --git a/ktransformers/server/api/openai/legacy/completions.py b/ktransformers/server/api/openai/legacy/completions.py
index 9808c3a..fe250f4 100644
--- a/ktransformers/server/api/openai/legacy/completions.py
+++ b/ktransformers/server/api/openai/legacy/completions.py
@@ -20,7 +20,7 @@ async def create_completion(request:Request,create:CompletionCreate):
 
     if create.stream:
         async def inner():
-            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p,create.repetition_penalty):
+            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
                 d = {'choices':[{'delta':{'content':token}}]}
                 yield f"data:{json.dumps(d)}\n\n"
             d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
@@ -28,6 +28,6 @@ async def create_completion(request:Request,create:CompletionCreate):
         return stream_response(request,inner())
     else:
         comp = CompletionObject(id=id,object='text_completion',created=int(time()))
-        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p,create.repetition_penalty):
+        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
             comp.append_token(token)
         return comp
diff --git a/ktransformers/server/backend/interfaces/transformers.py b/ktransformers/server/backend/interfaces/transformers.py
index 2674dd1..d2e48a4 100644
--- a/ktransformers/server/backend/interfaces/transformers.py
+++ b/ktransformers/server/backend/interfaces/transformers.py
@@ -202,20 +202,18 @@ class TransformersInterface(BackendInterfaceBase):
         self.seq_length += 1
         return self.streamer.put(new_tokens)
 
-    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
+    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
         if temperature is None:
             temperature = self.args.temperature
         if top_p is None:
             top_p = self.args.top_p
-        if repetition_penalty is None:
-            repetition_penalty = self.args.repetition_penalty
         generation_config, model_kwargs = self.model._prepare_generation_config(
             None, max_length=self.args.max_new_tokens,
             do_sample=True,
             top_k=self.args.top_k,
             top_p=top_p,
             temperature=temperature,
-            repetition_penalty=repetition_penalty # change this to modify generate config
+            repetition_penalty=self.args.repetition_penalty # change this to modify generate config
         )
         self.inputs = inputs
         self.generation_config = generation_config
@@ -261,7 +259,7 @@ class TransformersInterface(BackendInterfaceBase):
         return self.logits_to_token(logits)
 
     @torch.no_grad
-    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
+    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None):
         input_ids_length = input_ids.shape[-1]
         logger.debug(f"input_ids: {input_ids.shape}")
 
@@ -329,7 +327,7 @@ class TransformersInterface(BackendInterfaceBase):
         else:
             logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
 
-        self.prepare_logits_wrapper(input_ids, device, temperature, top_p, repetition_penalty)
+        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
         next_token = self.logits_to_token(logits[0, -1, :])
         yield self.append_new_tokens(next_token)
 
@@ -365,7 +363,7 @@ class TransformersInterface(BackendInterfaceBase):
             self.last_request_id = thread_id
             return True
 
-    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None):
+    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None):
         self.streamer.reset()
         self.profiler.create_and_start_timer("tokenize")
         if isinstance(local_messages, List):
@@ -392,7 +390,7 @@ class TransformersInterface(BackendInterfaceBase):
             print(think, end="",flush=True)
             yield think
 
-        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p, repetition_penalty):
+        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p):
             # output think token after prefill done
             if t is not None:
                 print(t, end="",flush=True)
diff --git a/ktransformers/server/schemas/endpoints/chat.py b/ktransformers/server/schemas/endpoints/chat.py
index b929c4b..5507266 100644
--- a/ktransformers/server/schemas/endpoints/chat.py
+++ b/ktransformers/server/schemas/endpoints/chat.py
@@ -27,7 +27,7 @@ class ChatCompletionCreate(BaseModel):
     stream : bool = False
     temperature: Optional[float]
     top_p: Optional[float]
-    repetition_penalty: Optional[float]
+    frequency_penalty: Optional[float]
 
     def get_tokenizer_messages(self):
         return [m.to_tokenizer_message() for m in self.messages]
diff --git a/ktransformers/server/schemas/legacy/completions.py b/ktransformers/server/schemas/legacy/completions.py
index c5876d4..ca4b89c 100644
--- a/ktransformers/server/schemas/legacy/completions.py
+++ b/ktransformers/server/schemas/legacy/completions.py
@@ -11,7 +11,7 @@ class CompletionCreate(BaseModel):
     stream: bool = False
     temperature: Optional[float]
     top_p: Optional[float]
-    repetition_penalty: Optional[float]
+    frequency_penalty: Optional[float]
 
     def get_tokenizer_messages(self):
         if isinstance(self.prompt,List):

From 91062a834f3b586beb58fe5ef20bb637ff8c3c27 Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:38:01 +0800
Subject: [PATCH 5/7] Default values

---
 ktransformers/server/schemas/endpoints/chat.py     | 7 +++----
 ktransformers/server/schemas/legacy/completions.py | 5 ++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/ktransformers/server/schemas/endpoints/chat.py b/ktransformers/server/schemas/endpoints/chat.py
index 5507266..821b3b9 100644
--- a/ktransformers/server/schemas/endpoints/chat.py
+++ b/ktransformers/server/schemas/endpoints/chat.py
@@ -25,10 +25,9 @@ class ChatCompletionCreate(BaseModel):
     messages: List[Message]
     model : str
     stream : bool = False
-    temperature: Optional[float]
-    top_p: Optional[float]
-    frequency_penalty: Optional[float]
-
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+
     def get_tokenizer_messages(self):
         return [m.to_tokenizer_message() for m in self.messages]
 
diff --git a/ktransformers/server/schemas/legacy/completions.py b/ktransformers/server/schemas/legacy/completions.py
index ca4b89c..ea936ea 100644
--- a/ktransformers/server/schemas/legacy/completions.py
+++ b/ktransformers/server/schemas/legacy/completions.py
@@ -9,9 +9,8 @@ class CompletionCreate(BaseModel):
     model: str
     prompt: str | List[str]
    stream: bool = False
-    temperature: Optional[float]
-    top_p: Optional[float]
-    frequency_penalty: Optional[float]
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
 
     def get_tokenizer_messages(self):
         if isinstance(self.prompt,List):

From 07eb712a73e65952114d3e404b4326d0d390cedc Mon Sep 17 00:00:00 2001
From: lazymio
Date: Mon, 24 Feb 2025 21:51:14 +0800
Subject: [PATCH 6/7] Left out

---
 ktransformers/server/backend/interfaces/ktransformers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ktransformers/server/backend/interfaces/ktransformers.py b/ktransformers/server/backend/interfaces/ktransformers.py
index 85bfb29..88b7e4b 100644
--- a/ktransformers/server/backend/interfaces/ktransformers.py
+++ b/ktransformers/server/backend/interfaces/ktransformers.py
@@ -127,7 +127,7 @@ class KTransformersInterface(TransformersInterface):
 
 
     @torch.no_grad
-    def prefill(self, input_ids: torch.Tensor, is_new: bool):
+    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float], top_p: Optional[float]):
         input_ids_length = input_ids.shape[-1]
         logger.debug(f"input_ids: {input_ids.shape}")
 
@@ -198,7 +198,7 @@ class KTransformersInterface(TransformersInterface):
         else:
             logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
 
-        self.prepare_logits_wrapper(input_ids, device)
+        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
         next_token = self.logits_to_token(logits[0, -1, :])
         yield self.append_new_tokens(next_token)
 

From b121ca4df8f921f254b14d28b3474e6d745ffed0 Mon Sep 17 00:00:00 2001
From: lazymio
Date: Thu, 27 Feb 2025 18:11:35 +0800
Subject: [PATCH 7/7] Fix according to upstream changes

---
 ktransformers/server/backend/interfaces/ktransformers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ktransformers/server/backend/interfaces/ktransformers.py b/ktransformers/server/backend/interfaces/ktransformers.py
index 47d99c6..6de0998 100644
--- a/ktransformers/server/backend/interfaces/ktransformers.py
+++ b/ktransformers/server/backend/interfaces/ktransformers.py
@@ -201,10 +201,9 @@ class KTransformersInterface(TransformersInterface):
         else:
             logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
 
-        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
         if flashinfer_enabled:
             MLAWrapperSingleton.reset_buffer()
-        self.prepare_logits_wrapper(input_ids, device)
+        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
         next_token = self.logits_to_token(logits[0, -1, :])
         yield self.append_new_tokens(next_token)
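
Note: with this series applied, the OpenAI-compatible /v1/chat/completions and legacy completions endpoints accept optional per-request temperature and top_p fields, which fall back to the server-side defaults (self.args.temperature / self.args.top_p) when omitted. A minimal usage sketch follows; the base URL, port, and model name below are illustrative placeholders and are not taken from these patches:

    # Sketch: exercise the new per-request sampling fields.
    # The endpoint URL and model name are assumptions, not values defined by this series.
    import requests

    resp = requests.post(
        "http://localhost:10002/v1/chat/completions",  # assumed host/port of the local server
        json={
            "model": "ktransformers",                  # placeholder model name
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": False,
            "temperature": 0.6,  # overrides self.args.temperature for this request only
            "top_p": 0.9,        # overrides self.args.top_p for this request only
        },
    )
    print(resp.json())

Omitting temperature and top_p keeps the pre-series behaviour, since both schema fields default to None and prepare_logits_wrapper then falls back to the configured defaults.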