MiniCPM/inference/vllm/examples/infer_cpm_mistral/inference.py

import argparse

from vllm import LLM, SamplingParams

parser = argparse.ArgumentParser()

parser.add_argument("--model_path", type=str, default="")

args = parser.parse_args()

# Sample prompts.
prompts = [
    "北京烤鸭真好吃，正好有一家北京烤鸭店，我想去吃北京烤鸭",
    "def reverse_list(list):",
    "Beijing is the capital of",
    "1 + 1 = ",
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
    '''0123456789'''*810 + " what is next number?",
    "您好，三加二等于多少？",
    "我是",
    ""
]

params_dict = {
    "n": 1,
    "best_of": None,
    "presence_penalty": 0.0,
    "frequency_penalty": 0.0,
    "temperature": 1,
    "top_p": 0.9,
    "top_k": -1,
    "use_beam_search": False,
    "length_penalty": 1.0,
    "early_stopping": False,
    "stop": None,
    "stop_token_ids": None,
    "ignore_eos": False,
    "max_tokens": 100,
    "logprobs": None,
    "prompt_logprobs": None,
    "skip_special_tokens": True,
}

# Create a sampling params object.
sampling_params = SamplingParams(**params_dict)

# Create an LLM.
llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")