From f0f07d8f24727edf9e6f156d62781506683ade53 Mon Sep 17 00:00:00 2001
From: root <403644786@qq.com>
Date: Mon, 1 Jul 2024 16:47:23 +0800
Subject: [PATCH 1/3] Add a new AutoGPTQ quantization example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 quantize/gptq_quantize.py | 236 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 quantize/gptq_quantize.py

diff --git a/quantize/gptq_quantize.py b/quantize/gptq_quantize.py
new file mode 100644
index 0000000..a98285b
--- /dev/null
+++ b/quantize/gptq_quantize.py
@@ -0,0 +1,236 @@
+"""
+AutoGPTQ has not been updated for a long time. Before running GPTQ quantization,
+install our AutoGPTQ branch first, otherwise this script will not run correctly:
+
+```bash
+git clone -b minicpm_gptq https://github.com/LDLINGLINGLING/AutoGPTQ.git
+cd AutoGPTQ
+pip install -e .
+```
+"""
+
+import json
+import os
+import random
+import shutil
+import time
+from argparse import ArgumentParser
+
+import torch
+from datasets import Dataset
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+
+
+def copy_missing_files(src_path, dst_path):
+    """Copy auxiliary files (tokenizer scripts, custom modeling code, etc.) that the
+    quantized model directory is still missing, skipping weights, json and hidden files."""
+    src_files = os.listdir(src_path)
+    dst_files = os.listdir(dst_path)
+    for src_file in src_files:
+        if (
+            src_file not in dst_files
+            and not src_file.endswith((".bin", ".json"))
+            and not src_file.startswith(".")
+        ):
+            shutil.copy2(os.path.join(src_path, src_file), os.path.join(dst_path, src_file))
+
+
+def load_data(data_path, tokenizer, n_samples):
+    """Load an alpaca-style JSON file and tokenize up to n_samples examples as calibration data."""
+    with open(data_path, "r", encoding="utf-8") as f:
+        raw_data = json.load(f)
+
+    raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
+
+    def dummy_gen():
+        return raw_data
+
+    def tokenize(examples):
+        instructions = examples["instruction"]
+        inputs = examples["input"]
+        outputs = examples["output"]
+
+        prompts = []
+        texts = []
+        input_ids = []
+        attention_mask = []
+        for istr, inp, opt in zip(instructions, inputs, outputs):
+            # MiniCPM chat template: <用户>{query}<AI>{response}, wrapped in <s>...</s>
+            if inp:
+                prompt = f"<用户>:\n{istr + inp}<AI>:\n"
+                text = "<s>" + prompt + opt + "</s>"
+            else:
+                prompt = f"<用户>\n{istr}\n<AI>:\n"
+                text = "<s>" + prompt + opt + "</s>"
+            if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
+                continue
+
+            tokenized_data = tokenizer(text)
+
+            input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
+            attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
+            prompts.append(prompt)
+            texts.append(text)
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "prompt": prompts,
+        }
+
+    dataset = Dataset.from_generator(dummy_gen)
+
+    dataset = dataset.map(
+        tokenize,
+        batched=True,
+        batch_size=len(dataset),
+        num_proc=1,
+        keep_in_memory=True,
+        load_from_cache_file=False,
+        remove_columns=["instruction", "input"],
+    )
+
+    dataset = dataset.to_list()
+
+    for sample in dataset:
+        sample["input_ids"] = torch.LongTensor(sample["input_ids"])
+        sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
+
+    return dataset
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument("--pretrained_model_dir", type=str, default='/root/ld/ld_model_pretrained/MiniCPM-1B-sft-bf16')
+    parser.add_argument("--quantized_model_dir", type=str, default='/root/ld/ld_project/AutoGPTQ/examples/quantization/minicpm_1b_4bit')
+    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4])  # do not use 8 bit
+    parser.add_argument(
+        "--group_size",
+        type=int,
+        default=128,
+        help="group size, -1 means no grouping or full rank",
+    )
parser.add_argument("--desc_act", action="store_true", default=True,help="whether to quantize with desc_act") + parser.add_argument( + "--num_samples", + type=int, + default=128, + help="how many samples will be used to quantize model", + ) + parser.add_argument( + "--save_and_reload", + action="store_true", + default=True, + help="whether save quantized model to disk and reload back", + ) + parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer") + parser.add_argument( + "--use_triton", + action="store_true", + help="whether use triton to speedup at inference", + ) + parser.add_argument( + "--per_gpu_max_memory", + type=int, + default=None, + help="max memory used to load model per gpu", + ) + parser.add_argument( + "--cpu_max_memory", + type=int, + default=None, + help="max memory used to offload model to cpu", + ) + parser.add_argument( + "--quant_batch_size", + type=int, + default=8, + help="examples batch size for quantization", + ) + parser.add_argument( + "--trust_remote_code", + default=True, + action="store_true", + help="whether to trust remote code when loading model", + ) + parser.add_argument( + "--quant_data", + default='quantize_data/alpaca_data_cleaned.json', + help="the quant data path", + ) + + args = parser.parse_args() + + max_memory = {} + if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0: + if torch.cuda.is_available(): + max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}) + if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory: + max_memory["cpu"] = f"{args.cpu_max_memory}GIB" + if not max_memory: + max_memory = None + + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_dir, + use_fast=args.fast_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + model = AutoGPTQForCausalLM.from_pretrained( + args.pretrained_model_dir, + quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act), + max_memory=max_memory, + trust_remote_code=args.trust_remote_code, + ) + + examples = load_data(args.quant_data, tokenizer, args.num_samples) + examples_for_quant = [ + {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} for example in examples + ] + + start = time.time() + model.quantize( + examples_for_quant, + batch_size=args.quant_batch_size, + use_triton=args.use_triton, + autotune_warmup_after_quantized=args.use_triton, + ) + end = time.time() + print(f"quantization took: {end - start: .4f}s") + + if not args.quantized_model_dir: + args.quantized_model_dir = args.pretrained_model_dir + + if args.save_and_reload: + model.save_quantized(args.quantized_model_dir) + tokenizer.save_pretrained(args.quantized_model_dir) + copy_missing_files(args.pretrained_model_dir,args.quantized_model_dir) + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + model = AutoGPTQForCausalLM.from_quantized( + args.quantized_model_dir, + device="cuda:0", + use_triton=args.use_triton, + max_memory=max_memory, + inject_fused_mlp=True, + inject_fused_attention=True, + trust_remote_code=args.trust_remote_code, + ) + + pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer} + if not max_memory: + pipeline_init_kwargs["device"] = "cuda:0" + for example in random.sample(examples, k=min(4, len(examples))): + print(f"prompt: {example['prompt']}") + print("-" * 42) + print(f"golden: {example['output']}") + print("-" * 42) + start = time.time() + 
+        inputs = tokenizer(example["prompt"], return_tensors="pt").to(model.device)
+        output_ids = model.generate(**inputs, max_new_tokens=100)
+        print(tokenizer.decode(output_ids[0]))
+        print(f"generation took: {time.time() - start:.4f}s")
+
+
+if __name__ == "__main__":
+    import logging
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+        level=logging.INFO,
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    main()
\ No newline at end of file

From dae9354e96cbbe0af3dd03627ca0758d667dcdc9 Mon Sep 17 00:00:00 2001
From: root <403644786@qq.com>
Date: Mon, 1 Jul 2024 16:56:39 +0800
Subject: [PATCH 2/3] Update the README's AutoGPTQ section to match the new example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1c7c5cd..2d5be8f 100644
--- a/README.md
+++ b/README.md
@@ -287,9 +287,10 @@ print(model.response("<用户>山东省最高的山是哪座山, 它比黄山高
    pip install e .
    ```
 3. 前往[模型下载](#1)下载未量化的MiniCPM仓库下所有文件放至本地同一文件夹下,1b、2b模型均可,训练后模型亦可。
-4. 在./AutoGPTQ/examples/quantization路径下输入以下命令,其中no_quantized_path是第3步模型下载路径,save_path是量化模型保存路径,--bits 为量化位数可以选择输入4或者8
+4. 命令行输入以下命令,其中no_quant_model_path是第3步模型下载路径,quant_save_path是量化模型保存路径,--bits 为量化位数,可以选择2、3或4
 ```
-   python quant_with_alpaca.py --pretrained_model_dir no_quantized_path --quantized_model_dir save_path --bits 4
+   cd MiniCPM/quantize
+   python gptq_quantize.py --pretrained_model_dir no_quant_model_path --quantized_model_dir quant_save_path --bits 4
 ```
 5. 可以使用./AutoGPTQ/examples/quantization/inference.py进行推理,也可以参考前文使用vllm对量化后的模型,单卡4090下minicpm-1b-int4模型vllm推理在2000token/s左右。

From 5a3ed8359c7ccb03f10f68d040d62fd627589d86 Mon Sep 17 00:00:00 2001
From: root <403644786@qq.com>
Date: Mon, 1 Jul 2024 16:58:53 +0800
Subject: [PATCH 3/3] Add a README under the quantize directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 quantize/readme.md | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 quantize/readme.md

diff --git a/quantize/readme.md b/quantize/readme.md
new file mode 100644
index 0000000..344b458
--- /dev/null
+++ b/quantize/readme.md
@@ -0,0 +1,62 @@
+## Model Quantization
+
+
+**GPTQ quantization**
+1. First, fetch the [minicpm_gptq branch of AutoGPTQ](https://github.com/LDLINGLINGLING/AutoGPTQ/tree/minicpm_gptq) with git.
+2. Change into the AutoGPTQ root directory ./AutoGPTQ and run:
+   ```
+   pip install -e .
+   ```
+3. Go to [Model Download](#1) and download all files of an unquantized MiniCPM repository into a single local folder; the 1B and 2B models both work, and fine-tuned models work as well.
+4. Run the following commands, where no_quant_model_path is the model path downloaded in step 3, quant_save_path is where the quantized model will be saved, and --bits is the quantization bit-width (2, 3, or 4; this script does not support 8-bit):
+   ```
+   cd MiniCPM/quantize
+   python gptq_quantize.py --pretrained_model_dir no_quant_model_path --quantized_model_dir quant_save_path --bits 4
+   ```
+5. You can run inference with ./AutoGPTQ/examples/quantization/inference.py, or serve the quantized model with vLLM as described earlier; on a single 4090, the MiniCPM-1B int4 model reaches roughly 2000 token/s with vLLM. A minimal loading-and-generation sketch follows this list.
+
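+To sanity-check the quantized model, it can be reloaded with the same `from_quantized` call that `gptq_quantize.py` itself uses. The snippet below is only a rough sketch: the path is a placeholder, and the `<用户>…<AI>` prompt string follows the chat format used elsewhere in this repository. For a full-featured setup, use inference.py or vLLM as noted in step 5.
+
+```python
+from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+
+quant_path = "quant_save_path"  # placeholder: the directory produced in step 4
+tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
+model = AutoGPTQForCausalLM.from_quantized(quant_path, device="cuda:0", trust_remote_code=True)
+
+inputs = tokenizer("<用户>山东省最高的山是哪座山?<AI>", return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=100)[0]))
+```
+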
+
+**AWQ quantization**
+1. In quantize/awq_quantize.py, adjust the configuration parameters according to the comments:
+   ```python
+   model_path = '/root/ld/ld_model_pretrained/MiniCPM-1B-sft-bf16' # model_path or model_id
+   quant_path = '/root/ld/ld_project/pull_request/MiniCPM/quantize/awq_cpm_1b_4bit' # quant_save_path
+   quant_data_path='/root/ld/ld_project/pull_request/MiniCPM/quantize/quantize_data/wikitext' # path to a bundled calibration set: alpaca or wikitext under quantize_data
+   quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } # "w_bit": 4 or 8
+   quant_samples=512 # how many samples to use for calibration
+   custom_data=[{'question':'What is your name?','answer':'I am MiniCPM, the little powerhouse open-sourced by OpenBMB.'}, # a custom dataset can be used instead
+                {'question':'What makes you special?','answer':'I am small, but I am strong.'}]
+   ```
+2. Two calibration datasets, alpaca and wikitext, are already provided under quantize/quantize_data; point quant_data_path above to one of these folders.
+3. If you want to calibrate on a custom dataset, edit the custom_data variable in quantize/awq_quantize.py, for example:
+   ```python
+   custom_data=[{'question':'What are the symptoms of allergic rhinitis?','answer':'Allergic rhinitis can cause recurring nasal congestion, runny nose and headaches; seek medical care promptly if it becomes severe.'},
+                {'question':'What is 1+1?','answer':'It equals 2.'}]
+   ```
+4. Depending on the dataset you chose, replace line 38 of quantize/awq_quantize.py with one of the following lines:
+   ```python
+   # quantize with wikitext
+   model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext(quant_data_path=quant_data_path))
+   # quantize with alpaca
+   model.quantize(tokenizer, quant_config=quant_config, calib_data=load_alpaca(quant_data_path=quant_data_path))
+   # quantize with the custom dataset
+   model.quantize(tokenizer, quant_config=quant_config, calib_data=load_cust_data(quant_data_path=quant_data_path))
+   ```
+5. Run quantize/awq_quantize.py; the AWQ-quantized model is written to the directory set in quant_path. A simplified end-to-end sketch follows this list.
+
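+For orientation, the steps above roughly correspond to the AutoAWQ calls sketched below. This is a simplified illustration, not the actual contents of awq_quantize.py: the paths are placeholders, and a real run should calibrate on several hundred samples (quant_samples) via the load_wikitext/load_alpaca/load_cust_data helpers rather than the single hard-coded sentence used here.
+
+```python
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+model_path = "/path/to/MiniCPM-1B-sft-bf16"  # placeholder
+quant_path = "/path/to/awq_cpm_1b_4bit"      # placeholder
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
+
+model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+# calib_data may be a list of raw text strings; the repository's loader helpers build such a list.
+calib_data = ["MiniCPM is an end-side large language model released by OpenBMB."]
+model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
+
+model.save_quantized(quant_path)
+tokenizer.save_pretrained(quant_path)
+```
+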
+
+**Quantization evaluation**
+1. From the command line, change into the MiniCPM/quantize directory.
+2. In quantize_eval.sh, set awq_path, gptq_path and model_path; leave the paths you do not want to test as empty strings. The example below tests only the AWQ model:
+   ```
+   awq_path="/root/ld/ld_project/AutoAWQ/examples/awq_cpm_1b_4bit"
+   gptq_path=""
+   model_path=""
+   ```
+3. Still in the MiniCPM/quantize directory, run:
+   ```
+   bash quantize_eval.sh
+   ```
+4. The console will report the model's memory footprint and perplexity; a rough manual sketch of such a check follows.
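+
+For reference, numbers of the same kind can be estimated by hand as sketched below. This is a hypothetical illustration rather than the implementation of quantize_eval.sh: the path is a placeholder, the perplexity is only approximate, and a real evaluation should use a proper held-out text set instead of the single sentence shown here.
+
+```python
+import math
+
+import torch
+from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+
+quant_path = "quant_save_path"  # placeholder: a model produced by gptq_quantize.py
+tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
+model = AutoGPTQForCausalLM.from_quantized(quant_path, device="cuda:0", trust_remote_code=True)
+
+# Peak GPU memory after loading the quantized weights.
+print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 1024 ** 3:.2f} GiB")
+
+eval_texts = ["MiniCPM is an end-side large language model."]  # replace with a real evaluation set
+nll_sum, token_count = 0.0, 0
+for text in eval_texts:
+    enc = tokenizer(text, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        loss = model(**enc, labels=enc["input_ids"]).loss  # mean token negative log-likelihood
+    nll_sum += loss.item() * enc["input_ids"].numel()
+    token_count += enc["input_ids"].numel()
+print(f"approximate perplexity: {math.exp(nll_sum / token_count):.2f}")
+```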