Merge branch 'OpenBMB:main' into add_cpmV_hfdemo

cxcz 2024-07-04 19:38:39 +08:00 committed by GitHub
commit a5c9423f7e
3 changed files with 301 additions and 2 deletions


@@ -287,9 +287,10 @@ print(model.response("<用户>山东省最高的山是哪座山, 它比黄山高
 pip install -e .
 ```
 3. Go to [Model Download](#1) and download all files of the un-quantized MiniCPM repository into a single local folder; the 1B and 2B models both work, as do fine-tuned models.
-4. In ./AutoGPTQ/examples/quantization, run the following command, where no_quantized_path is the model path from step 3, save_path is the directory to save the quantized model, and --bits is the quantization bit width (4 or 8):
+4. Run the following commands, where no_quant_model_path is the model path from step 3, quant_save_path is the directory to save the quantized model, and --bits is the quantization bit width (4 or 8):
 ```
-python quant_with_alpaca.py --pretrained_model_dir no_quantized_path --quantized_model_dir save_path --bits 4
+cd MiniCPM/quantize
+python gptq_quantize.py --pretrained_model_dir no_quant_model_path --quantized_model_dir quant_save_path --bits 4
 ```
 5. You can run inference with ./AutoGPTQ/examples/quantization/inference.py, or follow the earlier instructions and serve the quantized model with vLLM; on a single 4090, the MiniCPM-1B-int4 model reaches roughly 2000 token/s with vLLM (a minimal loading sketch is given below).
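
Step 5 points at inference.py for a quick check; as a hedged aside (the path below is a placeholder and this is not the repository's inference.py), loading the GPTQ output with AutoGPTQ and generating once looks roughly like this:

```python
# Minimal smoke-test sketch; quant_save_path is a placeholder for the directory written in step 4.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

quant_save_path = "minicpm_1b_4bit"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(quant_save_path, trust_remote_code=True)
model = AutoGPTQForCausalLM.from_quantized(quant_save_path, device="cuda:0", trust_remote_code=True)

inputs = tokenizer("<用户>山东省最高的山是哪座山?<AI>", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=100)[0]))
```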

quantize/gptq_quantize.py (new file, 236 lines)

@@ -0,0 +1,236 @@
"""
由于autogptq已经不更新很久了使用gptq量化前请先安装我们的autogptq分支,否则代码无法正常运行
bash
git clone https://github.com/LDLINGLINGLING/AutoGPTQ/tree/minicpm_gptq
cd Autogptq
pip install e .
"""
import json
import os
import random
import shutil
import time
from argparse import ArgumentParser

import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import Dataset
from transformers import AutoTokenizer


def copy_missing_files(src_path, dst_path):
    # Copy auxiliary files (e.g. remote-code modeling/tokenizer scripts) from the original
    # model directory into the quantized output directory, skipping weight shards (.bin),
    # .json files already written by save_quantized, and hidden files.
    src_files = os.listdir(src_path)
    dst_files = os.listdir(dst_path)
    for src_file in src_files:
        if src_file not in dst_files and not src_file.endswith((".bin", ".json")) and not src_file.startswith("."):
            shutil.copy2(os.path.join(src_path, src_file), os.path.join(dst_path, src_file))


def load_data(data_path, tokenizer, n_samples):
    """Sample up to n_samples records from an Alpaca-style JSON file and tokenize
    them into GPTQ calibration examples."""
    with open(data_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
def dummy_gen():
return raw_data
def tokenize(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
prompts = []
texts = []
input_ids = []
attention_mask = []
        for istr, inp, opt in zip(instructions, inputs, outputs):
            # Both branches build the same <USER>/<AI> chat template; the "input"
            # field, when present, is appended to the instruction.
            if inp:
                prompt = f"<USER>\n{istr + inp}\n<AI>:\n"
                text = "<s>" + prompt + opt + "</s>"
            else:
                prompt = f"<USER>\n{istr}\n<AI>:\n"
                text = "<s>" + prompt + opt + "</s>"
if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
continue
tokenized_data = tokenizer(text)
input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
}
dataset = Dataset.from_generator(dummy_gen)
dataset = dataset.map(
tokenize,
batched=True,
batch_size=len(dataset),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
remove_columns=["instruction", "input"],
)
dataset = dataset.to_list()
for sample in dataset:
sample["input_ids"] = torch.LongTensor(sample["input_ids"])
sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
return dataset


def main():
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_dir", type=str, default='/root/ld/ld_model_pretrained/MiniCPM-1B-sft-bf16')
    parser.add_argument("--quantized_model_dir", type=str, default='/root/ld/ld_project/AutoGPTQ/examples/quantization/minicpm_1b_4bit')
    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4])  # do not use 8 bit
parser.add_argument(
"--group_size",
type=int,
default=128,
help="group size, -1 means no grouping or full rank",
)
parser.add_argument("--desc_act", action="store_true", default=True,help="whether to quantize with desc_act")
parser.add_argument(
"--num_samples",
type=int,
default=128,
help="how many samples will be used to quantize model",
)
parser.add_argument(
"--save_and_reload",
action="store_true",
default=True,
help="whether save quantized model to disk and reload back",
)
parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
parser.add_argument(
"--use_triton",
action="store_true",
help="whether use triton to speedup at inference",
)
parser.add_argument(
"--per_gpu_max_memory",
type=int,
default=None,
help="max memory used to load model per gpu",
)
parser.add_argument(
"--cpu_max_memory",
type=int,
default=None,
help="max memory used to offload model to cpu",
)
parser.add_argument(
"--quant_batch_size",
type=int,
default=8,
help="examples batch size for quantization",
)
parser.add_argument(
"--trust_remote_code",
default=True,
action="store_true",
help="whether to trust remote code when loading model",
)
parser.add_argument(
"--quant_data",
default='quantize_data/alpaca_data_cleaned.json',
help="the quant data path",
)
args = parser.parse_args()
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
tokenizer = AutoTokenizer.from_pretrained(
args.pretrained_model_dir,
use_fast=args.fast_tokenizer,
trust_remote_code=args.trust_remote_code,
)
model = AutoGPTQForCausalLM.from_pretrained(
args.pretrained_model_dir,
quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
max_memory=max_memory,
trust_remote_code=args.trust_remote_code,
)
examples = load_data(args.quant_data, tokenizer, args.num_samples)
examples_for_quant = [
{"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} for example in examples
]
start = time.time()
model.quantize(
examples_for_quant,
batch_size=args.quant_batch_size,
use_triton=args.use_triton,
autotune_warmup_after_quantized=args.use_triton,
)
end = time.time()
print(f"quantization took: {end - start: .4f}s")
if not args.quantized_model_dir:
args.quantized_model_dir = args.pretrained_model_dir
if args.save_and_reload:
model.save_quantized(args.quantized_model_dir)
tokenizer.save_pretrained(args.quantized_model_dir)
copy_missing_files(args.pretrained_model_dir,args.quantized_model_dir)
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(
args.quantized_model_dir,
device="cuda:0",
use_triton=args.use_triton,
max_memory=max_memory,
inject_fused_mlp=True,
inject_fused_attention=True,
trust_remote_code=args.trust_remote_code,
)
pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
if not max_memory:
pipeline_init_kwargs["device"] = "cuda:0"
for example in random.sample(examples, k=min(4, len(examples))):
print(f"prompt: {example['prompt']}")
print("-" * 42)
print(f"golden: {example['output']}")
print("-" * 42)
        start = time.time()
        inputs = tokenizer(example["prompt"], return_tensors="pt").to(model.device)
        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=100)[0]))
        print(f"generation took: {time.time() - start: .4f}s")


if __name__ == "__main__":
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    main()

quantize/readme.md (new file, 62 lines)

@@ -0,0 +1,62 @@
## Model Quantization
<p id="gptq"></p>
**GPTQ Quantization**
1. First clone the [minicpm_gptq branch of AutoGPTQ](https://github.com/LDLINGLINGLING/AutoGPTQ/tree/minicpm_gptq).
2. Enter the AutoGPTQ root directory (./AutoGPTQ) and run:
```
pip install -e .
```
3. Go to [Model Download](#1) and download all files of the un-quantized MiniCPM repository into a single local folder; the 1B and 2B models both work, as do fine-tuned models.
4. Run the following commands, where no_quant_model_path is the model path from step 3, quant_save_path is the directory to save the quantized model, and --bits is the quantization bit width (4 or 8):
```
cd MiniCPM/quantize
python gptq_quantize.py --pretrained_model_dir no_quant_model_path --quantized_model_dir quant_save_path --bits 4
```
5. You can run inference with ./AutoGPTQ/examples/quantization/inference.py, or follow the earlier instructions and serve the quantized model with vLLM; on a single 4090, the MiniCPM-1B-int4 model reaches roughly 2000 token/s with vLLM (a hedged vLLM sketch is given below).
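
As a hedged aside (not part of this PR; the model path and sampling settings below are placeholders), serving the GPTQ output with vLLM's offline API looks roughly like this; depending on the vLLM version, `quantization="gptq"` may also be auto-detected from the checkpoint config:

```python
# Rough offline-inference sketch with vLLM for the GPTQ-quantized checkpoint.
from vllm import LLM, SamplingParams

llm = LLM(model="quant_save_path", trust_remote_code=True, quantization="gptq")  # placeholder path
params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=100)
for out in llm.generate(["<用户>山东省最高的山是哪座山?<AI>"], params):
    print(out.outputs[0].text)
```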
<p id="awq"></p>
**AWQ Quantization**
1. In quantize/awq_quantize.py, adjust the configuration parameters according to the comments:
```python
model_path = '/root/ld/ld_model_pretrained/MiniCPM-1B-sft-bf16' # model_path or model_id
quant_path = '/root/ld/ld_project/pull_request/MiniCPM/quantize/awq_cpm_1b_4bit' # quant_save_path
quant_data_path='/root/ld/ld_project/pull_request/MiniCPM/quantize/quantize_data/wikitext' # point this to the bundled alpaca or wikitext folder under quantize_data
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } # "w_bit": 4 or 8
quant_samples=512 # how many samples to use for calibration
custom_data=[{'question':'What is your name?','answer':'I am MiniCPM, the "little cannon" open-sourced by OpenBMB.'}, # a custom dataset can also be used
             {'question':'What makes you special?','answer':'I am small, but I am strong.'}]
```
2. The quantize/quantize_data folder already ships two calibration datasets, alpaca and wikitext; set quant_data_path above to the path of one of these folders.
3. If you need a custom calibration dataset, modify the custom_data variable in quantize/awq_quantize.py, for example:
```python
custom_data=[{'question':'What are the symptoms of allergic rhinitis?','answer':'Allergic rhinitis can cause recurring nasal congestion, a runny nose, headaches and similar symptoms; if they become severe, seek medical attention promptly.'},
             {'question':'What is 1+1?','answer':'It equals 2.'}]
```
4. Depending on the dataset you chose, replace line 38 of quantize/awq_quantize.py with one of the following lines:
```python
# quantize with wikitext
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext(quant_data_path=quant_data_path))
# quantize with alpaca
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_alpaca(quant_data_path=quant_data_path))
# quantize with a custom dataset
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_cust_data(quant_data_path=quant_data_path))
```
5. Run quantize/awq_quantize.py; the AWQ-quantized model will be written to the quant_path directory you configured (a hedged sketch of this overall flow is given below).
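
The script quantize/awq_quantize.py itself is not shown in this diff; a minimal sketch of the AutoAWQ flow the steps above configure might look like the following (paths and the inline calibration list are placeholders, and the real script passes its load_wikitext/load_alpaca/load_cust_data output as calib_data instead):

```python
# Hedged sketch of the AWQ flow, not the repository's awq_quantize.py.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "MiniCPM-1B-sft-bf16"   # placeholder, see step 1
quant_path = "awq_cpm_1b_4bit"       # placeholder output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Placeholder calibration text, repeated only so the calibration set is non-empty;
# use the bundled wikitext/alpaca loaders (or custom_data) for real runs.
calib_data = ["<用户>山东省最高的山是哪座山?<AI>山东省最高的山是泰山。"] * 64

model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```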
<p id="quantize_test"></p>
**Quantization Test**
1. From the command line, change into the MiniCPM/quantize directory.
2. In quantize_eval.sh, set awq_path, gptq_path and model_path; leave the model types you do not want to test as empty strings. The example below tests only the AWQ model:
```
awq_path="/root/ld/ld_project/AutoAWQ/examples/awq_cpm_1b_4bit"
gptq_path=""
model_path=""
```
3. In the MiniCPM/quantize directory, run:
```
bash quantize_eval.sh
```
4. The console will report the model's memory footprint and perplexity (a hedged perplexity sketch is given below).
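
The evaluation script's internals are not shown in this PR; purely as an illustration of the perplexity number it reports, a minimal chunked perplexity measurement over a text file could look like the sketch below (model_path and eval.txt are placeholders, and this is not quantize_eval.sh's actual logic):

```python
# Hedged sketch: chunked perplexity of a causal LM over one text file.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "awq_cpm_1b_4bit"  # placeholder: a quantized output directory or the fp16 model
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
model.eval()

text = open("eval.txt", encoding="utf-8").read()  # placeholder evaluation text
input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

chunk_len = 2048
nlls, counted = [], 0
with torch.no_grad():
    for begin in range(0, input_ids.size(1) - 1, chunk_len):
        chunk = input_ids[:, begin : begin + chunk_len]
        # labels == inputs: the model shifts them internally for next-token loss
        loss = model(chunk, labels=chunk).loss
        n_predicted = chunk.size(1) - 1
        nlls.append(loss * n_predicted)
        counted += n_predicted

print(f"perplexity: {torch.exp(torch.stack(nlls).sum() / counted).item():.2f}")
```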