add autoawq example

root 2024-06-24 10:57:19 +08:00
parent b808010417
commit f062357093
28 changed files with 258741 additions and 13 deletions


@ -6,8 +6,7 @@ from typing import Dict, Optional
 import torch
 import transformers
 from torch.utils.data import Dataset
-from transformers import (AutoModelForCausalLM, AutoTokenizer, Trainer,
-                          TrainingArguments)
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments


 @dataclass
@ -48,15 +47,19 @@ class SupervisedDataset(Dataset):
         data_path,
         tokenizer,
         model_max_length=4096,
-        user_tokens='<用户>',
-        assistant_tokens='<AI>',
+        user_tokens="<用户>",
+        assistant_tokens="<AI>",
     ):
         super(SupervisedDataset, self).__init__()
         self.data = json.load(open(data_path))
         self.tokenizer = tokenizer
         self.model_max_length = model_max_length
-        self.user_tokens = self.tokenizer.encode(user_tokens)  # for any model, this resolves to the id(s) of <用户>
-        self.assistant_tokens = self.tokenizer.encode(assistant_tokens)  # for any model, this resolves to the id(s) of <AI>
+        self.user_tokens = self.tokenizer.encode(
+            user_tokens
+        )  # for any model, this resolves to the id(s) of <用户>
+        self.assistant_tokens = self.tokenizer.encode(
+            assistant_tokens
+        )  # for any model, this resolves to the id(s) of <AI>
         self.ignore_index = -100
         item = self.preprocessing(self.data[0])
         print("input:", self.tokenizer.decode(item["input_ids"]))
@ -86,10 +89,9 @@ class SupervisedDataset(Dataset):
                 ] * len(content_ids)
             else:
                 input_ids += self.assistant_tokens + content_ids
-                label_ids += (
-                    [self.ignore_index] * len(self.assistant_tokens)
-                    + content_ids
-                )
+                label_ids += [self.ignore_index] * len(
+                    self.assistant_tokens
+                ) + content_ids

         input_ids.append(self.tokenizer.eos_token_id)
         label_ids.append(self.tokenizer.eos_token_id)
@ -171,7 +173,7 @@ if __name__ == "__main__":
         max_length=training_args.model_max_length,
         use_lora=training_args.use_lora,
         bf16=training_args.bf16,
-        fp16=training_args.fp16
+        fp16=training_args.fp16,
     )

     train_dataset = SupervisedDataset(
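For clarity, the labeling convention that preprocessing() applies is: user turns and both role markers are masked with ignore_index, so the loss is computed only on the assistant reply and the trailing EOS token. A minimal sketch with a hypothetical helper (build_labels is illustrative, not part of the diff):

IGNORE_INDEX = -100

def build_labels(user_marker, assistant_marker, user_content, assistant_content, eos_id):
    # user_marker / assistant_marker are the encoded <用户> / <AI> token ids
    input_ids = user_marker + user_content + assistant_marker + assistant_content + [eos_id]
    label_ids = (
        [IGNORE_INDEX] * (len(user_marker) + len(user_content))  # user turn: not supervised
        + [IGNORE_INDEX] * len(assistant_marker)                  # role marker: not supervised
        + assistant_content                                       # assistant reply: supervised
        + [eos_id]                                                # EOS: supervised
    )
    return input_ids, label_ids

# illustrative ids only
inp, lab = build_labels([101], [102], [5, 6, 7], [8, 9], eos_id=2)
assert len(inp) == len(lab)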

quantize/awq_quantize.py (new file)

@ -0,0 +1,40 @@
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = '/root/ld/ld_model_pretrained/MiniCPM-1B-sft-bf16'  # local model path or Hugging Face model id
quant_path = '/root/ld/ld_project/pull_request/MiniCPM/quantize/awq_cpm_1b_4bit'  # where to save the quantized model
quant_data_path = '/root/ld/ld_project/pull_request/MiniCPM/quantize/quantize_data/alpaca'  # calibration data path
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}  # "w_bit": 4 or 8
quant_samples = 512  # how many samples to use for calibration

# Load model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, device_map={"": "cuda:0"})

# Define calibration-data loading methods
def load_alpaca(quant_data_path):
    data = load_dataset(quant_data_path, split="train")  # absolute path to alpaca or a Hugging Face dataset id

    # concatenate instruction, input and output into a single chat-formatted text
    def concatenate_data(x):
        return {"text": '<s><用户>' + x['instruction'] + '<AI>' + x['input'] + '\n' + x['output']}

    concatenated = data.map(concatenate_data)[:quant_samples]
    return [text for text in concatenated["text"]]

def load_wikitext():
    data = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
    return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20][:quant_samples]

# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_alpaca(quant_data_path=quant_data_path))

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')
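Once saved, the 4-bit checkpoint can be loaded back for inference with AutoAWQ. A minimal sketch; the prompt and generation settings here are illustrative only and not part of this commit:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = '/root/ld/ld_project/pull_request/MiniCPM/quantize/awq_cpm_1b_4bit'  # directory written by save_quantized()

model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, device_map={"": "cuda:0"})
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

# The prompt mirrors the calibration template: <s><用户>...<AI>
prompt = '<s><用户>Please briefly introduce the MiniCPM model.<AI>'
inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

To calibrate on WikiText instead of alpaca, pass calib_data=load_wikitext() to model.quantize().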


@ -0,0 +1 @@
{"url": "hf://datasets/tatsu-lab/alpaca@dce01c9b08f87459cf36a430d809084718273017/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet", "etag": null}


@ -0,0 +1 @@
{"description": "", "citation": "", "homepage": "", "license": "", "features": {"instruction": {"dtype": "string", "_type": "Value"}, "input": {"dtype": "string", "_type": "Value"}, "output": {"dtype": "string", "_type": "Value"}, "text": {"dtype": "string", "_type": "Value"}}, "builder_name": "parquet", "dataset_name": "alpaca", "config_name": "default", "version": {"version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 46208623, "num_examples": 52002, "dataset_name": "alpaca"}}, "download_checksums": {"hf://datasets/tatsu-lab/alpaca@dce01c9b08f87459cf36a430d809084718273017/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet": {"num_bytes": 24246638, "checksum": null}}, "download_size": 24246638, "dataset_size": 46208623, "size_in_bytes": 70455261}

File diff suppressed because it is too large.


@ -0,0 +1 @@
{"url": "hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/validation-00000-of-00001.parquet", "etag": null}


@ -0,0 +1 @@
{"url": "hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/test-00000-of-00001.parquet", "etag": null}


@ -0,0 +1 @@
{"url": "hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/train-00000-of-00001.parquet", "etag": null}


@ -0,0 +1 @@
{"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "_type": "Value"}}, "builder_name": "parquet", "dataset_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1305088, "num_examples": 4358, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 11061717, "num_examples": 36718, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1159288, "num_examples": 3760, "dataset_name": "wikitext"}}, "download_checksums": {"hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/test-00000-of-00001.parquet": {"num_bytes": 732610, "checksum": null}, "hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/train-00000-of-00001.parquet": {"num_bytes": 6357543, "checksum": null}, "hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/validation-00000-of-00001.parquet": {"num_bytes": 657209, "checksum": null}}, "download_size": 7747362, "dataset_size": 13526093, "size_in_bytes": 21273455}

quantize/quantize_eval.py (new file)

@ -0,0 +1,113 @@
import argparse

import torch
import torch.nn as nn
import GPUtil
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from awq import AutoAWQForCausalLM
from auto_gptq import AutoGPTQForCausalLM

parser = argparse.ArgumentParser(description="======== quantization perplexity evaluation ========")
parser.add_argument(
    "--model_path",
    type=str,
    default='/root/ld/ld_model_pretrained/miniCPM-bf16',
    help="Path of the original, unquantized model.",
)
parser.add_argument(
    "--awq_path",
    type=str,
    default='/root/ld/ld_project/pull_request/MiniCPM/quantize/awq_cpm_2b_4bit',
    help="Path where the AWQ-quantized model was saved.",
)
# GPTQ support will be added later
parser.add_argument(
    "--gptq_path",
    type=str,
    default='/root/ld/ld_project/AutoGPTQ/examples/quantization/minicpm_2b_4bit',
    help="Path where the GPTQ-quantized model was saved.",
)
parser.add_argument(
    "--data_path",
    type=str,
    default='/root/ld/ld_project/pull_request/MiniCPM/quantize/quantize_data/wikitext',
    help="Evaluation dataset path; the example defaults to wikitext.",
)


def get_device():
    # currently unused; the evaluation below pins everything to cuda:0
    if torch.backends.mps.is_available():
        return "mps"
    elif torch.cuda.is_available():
        return "cuda:0"
    else:
        return "cpu"


def evaluate_perplexity(model, tokenizer, data_path):
    def _perplexity(nlls, n_samples, seqlen):
        return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

    # tokenize the whole test split as one long sequence, then evaluate in 2048-token chunks
    data = load_dataset(data_path, split="test")
    data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
    data = data.input_ids.to('cuda:0')

    seqlen = 2048
    model = model.eval()
    n_samples = data.numel() // seqlen

    nlls = []
    with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
        for i in progress_bar:
            start_index = i * seqlen
            end_index = (i + 1) * seqlen
            batch = data[:, start_index:end_index].to('cuda:0')
            with torch.no_grad():
                logits = model(batch).logits
            shift_logits = logits[:, :-1, :].contiguous().float()
            shift_labels = data[:, start_index:end_index][:, 1:]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            neg_log_likelihood = loss.float() * seqlen
            nlls.append(neg_log_likelihood)

            curr_ppl = _perplexity(nlls, i + 1, seqlen)
            progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

    ppl = _perplexity(nlls, n_samples, seqlen)
    return ppl.item()


if __name__ == "__main__":
    args = parser.parse_args()

    if args.model_path:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(args.model_path)
        print("pretrained model", args.model_path.split('/')[-1])
        gpu_usage = GPUtil.getGPUs()[0].memoryUsed
        print(f"gpu usage: {round(gpu_usage / 1024, 2)}GB")
        evaluate_perplexity(model, tokenizer, args.data_path)
        del model

    if args.awq_path:
        model = AutoAWQForCausalLM.from_quantized(args.awq_path, fuse_layers=True, device_map={"": 'cuda:0'})
        tokenizer = AutoTokenizer.from_pretrained(args.awq_path)
        print("awq model", args.awq_path.split('/')[-1])
        gpu_usage = GPUtil.getGPUs()[0].memoryUsed
        print(f"gpu usage: {round(gpu_usage / 1024, 2)}GB")
        evaluate_perplexity(model, tokenizer, args.data_path)
        del model

    # AutoGPTQ support will be added later
    if args.gptq_path:
        tokenizer = AutoTokenizer.from_pretrained(args.gptq_path, use_fast=True)
        model = AutoGPTQForCausalLM.from_quantized(args.gptq_path, device="cuda:0", trust_remote_code=True)
        print("gptq model", args.gptq_path.split('/')[-1])
        gpu_usage = GPUtil.getGPUs()[0].memoryUsed
        print(f"gpu usage: {round(gpu_usage / 1024, 2)}GB")
        evaluate_perplexity(model, tokenizer, args.data_path)
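For reference, _perplexity implements the standard definition ppl = exp(total negative log-likelihood / total evaluated tokens). A tiny self-contained check of that formula on random logits (sizes are illustrative only and independent of any model):

import torch
import torch.nn as nn

torch.manual_seed(0)
vocab, seqlen, n_samples = 10, 8, 3
loss_fct = nn.CrossEntropyLoss()

nlls = []
for _ in range(n_samples):
    logits = torch.randn(seqlen, vocab)          # stand-in for shift_logits
    labels = torch.randint(0, vocab, (seqlen,))  # stand-in for shift_labels
    nlls.append(loss_fct(logits, labels) * seqlen)  # mean token NLL * seqlen, as in the script

# aggregates per-chunk NLLs exactly as evaluate_perplexity() does
ppl = torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))
print(f"toy perplexity: {ppl.item():.3f}")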