Add README and update finetune scripts

This commit is contained in:
Xiang Long 2024-02-01 02:21:25 +08:00
parent 76845055c5
commit a250c200ed
10 changed files with 179 additions and 50 deletions

finetune/README.md Normal file
View File

@@ -0,0 +1,88 @@
# MiniCPM Fine-Tuning
This directory provides fine-tuning examples for the MiniCPM-2B model, covering both full-parameter fine-tuning and PEFT. In terms of data format, it includes a multi-turn dialogue fine-tuning example and an input/output-format fine-tuning example.
If you have downloaded the model locally, replace every `OpenBMB/MiniCPM-2B` reference in this document and in the code with the corresponding path so that the model is loaded from disk.
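For instance, a minimal loading sketch (the local path below is a placeholder for wherever you downloaded the weights, and `trust_remote_code=True` is assumed to be needed for MiniCPM's custom modeling code):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# "/path/to/MiniCPM-2B" is a placeholder for your local download directory;
# keep "OpenBMB/MiniCPM-2B" instead to pull the weights from the Hub.
path = "/path/to/MiniCPM-2B"
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
```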
**We also provide an [example notebook](lora_finetune.ipynb) that demonstrates, using AdvertiseGen as the example, how to process the data and use the fine-tuning scripts.**
Running the examples requires `python>=3.10`; in addition to the basic `torch` dependency, install the remaining requirements with:
```bash
pip install -r requirements.txt
```
## Tested Hardware
We provide single-node multi-GPU / multi-node multi-GPU examples, so you need at least one machine with multiple GPUs. With the **default config files** in this repository, GPU memory usage is as follows:
+ Full-parameter SFT: with 4 GPUs sharing the work evenly, each GPU uses `30245MiB` of GPU memory.
+ LoRA fine-tuning: a single GPU uses `10619MiB` of GPU memory.
> Note that these numbers are for reference only; memory usage will vary with different hyperparameters. Adjust them to fit your hardware.
## Multi-Turn Dialogue Format
The multi-turn dialogue fine-tuning example follows the ChatGLM3 conversation format convention: a different `loss_mask` is applied to each role, so the `loss` for every assistant reply in a conversation is computed in a single pass (a rough sketch of this masking follows the format example below).
The data files use the following format:
```json
[
    {
        "conversations": [
            {
                "role": "system",
                "content": "<system prompt text>"
            },
            {
                "role": "user",
                "content": "<user prompt text>"
            },
            {
                "role": "assistant",
                "content": "<assistant response text>"
            },
            // ... more turns
            {
                "role": "user",
                "content": "<user prompt text>"
            },
            {
                "role": "assistant",
                "content": "<assistant response text>"
            }
        ]
    }
    // ...
]
```
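As a rough illustration of the role-based masking described above (a hedged sketch only; the actual preprocessing in `finetune.py` tokenizes and wraps the turns with role markers and may differ in detail):
```python
# Tokens from system/user turns get label -100 (ignored by cross-entropy),
# while tokens from assistant turns keep their ids, so every assistant reply
# in the conversation contributes to the loss in a single forward pass.
def build_labels(sample, tokenizer):
    input_ids, labels = [], []
    for turn in sample["conversations"]:
        # Real code would also add the role/chat-template markers here.
        ids = tokenizer.encode(turn["content"], add_special_tokens=False)
        input_ids.extend(ids)
        if turn["role"] == "assistant":
            labels.extend(ids)                 # compute loss on assistant replies
        else:
            labels.extend([-100] * len(ids))   # mask out system/user tokens
    return input_ids, labels
```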
## 数据集格式示例
这里以 AdvertiseGen 数据集为例,
您可以从 [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)
或者 [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) 下载 AdvertiseGen 数据集。
将解压后的 AdvertiseGen 目录放到 `data` 目录下并自行转换为如下格式数据集。
> 请注意,现在的微调代码中加入了验证集,因此,对于一组完整的微调数据集,必须包含训练数据集和验证数据集,测试数据集可以不填写。或者直接用验证数据集代替。
```
{"conversations": [{"role": "user", "content": "类型#裙*裙长#半身裙"}, {"role": "assistant", "content": "这款百搭时尚的仙女半身裙,整体设计非常的飘逸随性,穿上之后每个女孩子都能瞬间变成小仙女啦。料子非常的轻盈,透气性也很好,穿到夏天也很舒适。"}]}
```
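A hedged conversion sketch, assuming each raw AdvertiseGen line is a JSON object with `content` (keyword string) and `summary` (ad copy) fields; the file names are assumptions, so adjust them to your layout:
```python
import json
import os

def convert(src_path, dst_path):
    """Map one raw AdvertiseGen file (JSON Lines) to the conversations format."""
    with open(src_path, encoding="utf-8") as fin, \
         open(dst_path, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            item = json.loads(line)
            sample = {"conversations": [
                {"role": "user", "content": item["content"]},
                {"role": "assistant", "content": item["summary"]},
            ]}
            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")

os.makedirs("data/AdvertiseGenChatML", exist_ok=True)
convert("data/AdvertiseGen/train.json", "data/AdvertiseGenChatML/train.json")
convert("data/AdvertiseGen/dev.json", "data/AdvertiseGenChatML/dev.json")
```
The resulting `train.json` / `dev.json` files match the `--train_data_path` / `--eval_data_path` arguments used by the scripts below.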
## Start Fine-Tuning
Run **single-node multi-GPU / multi-node multi-GPU** training with:
```bash
cd finetune
bash sft_finetune.sh
```
Run **single-node single-GPU** training with:
```bash
cd finetune
bash lora_finetune.sh
```

View File

@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,10 +12,15 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
     "wall_clock_breakdown": false,
     "flops_profiler": {
         "enabled": false,

View File

@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,16 +12,19 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true,
         "offload_optimizer": {
-            "device": "cpu"
-        },
-        "offload_param": {
-            "device": "cpu"
+            "device": "cpu",
+            "pin_memory": true
         }
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
     "wall_clock_breakdown": false,
     "flops_profiler": {
         "enabled": false,

View File

@@ -1,16 +1,32 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
     "bf16": {
         "enabled": "auto"
     },
     "zero_optimization": {
         "stage": 3,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "reduce_scatter": true,
+        "contiguous_gradients": true,
         "overlap_comm": true,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
         "stage3_gather_16bit_weights_on_model_save": true
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false,
     "flops_profiler": {
         "enabled": false,
         "profile_step": 1,

View File

@@ -0,0 +1,46 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 3,
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "reduce_scatter": true,
        "contiguous_gradients": true,
        "overlap_comm": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_gather_16bit_weights_on_model_save": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        }
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "flops_profiler": {
        "enabled": false,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}

View File

@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
-import time
 from typing import Dict, Optional
-from datetime import datetime
 from dataclasses import dataclass, field
 import torch
@@ -155,7 +153,7 @@ def load_model_and_tokenizer(
         model = get_peft_model(model, lora_config)
         # trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305
         model.print_trainable_parameters()
-        model.enable_input_require_grads()
+        # model.enable_input_require_grads()  # need when using adapter
     return model, tokenizer
@@ -184,15 +182,12 @@ if __name__ == "__main__":
         model_max_length=training_args.model_max_length,
     )
-    formatted_time = datetime.now().strftime("%Y%m%d%H%M%S")
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         tokenizer=tokenizer,
-        # compute_metrics=compute_metrics,
     )
     trainer.train()

View File

@@ -9,7 +9,7 @@
     "This notebook is a code example that LoRA fine-tunes MiniCPM-2B on the `AdvertiseGen` dataset so that it can generate professional advertising copy.\n",
     "\n",
     "## Hardware Requirements\n",
-    "- GPU memory: 24GB\n",
+    "- GPU memory: 12GB\n",
     "- GPU architecture: Ampere (recommended)\n",
     "- RAM: 16GB"
    ]
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!bash lora_finetune.sh"
+    "!bash lora_finetune_ds.sh"
    ]
   }
  ],

View File

@@ -1,16 +1,17 @@
 formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
-CUDA_VISIBLE_DEVICES=0 python finetune.py \
+deepspeed --include localhost:0 finetune.py \
 --model_name_or_path <your_model_name_or_path> \
 --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
 --train_data_path data/AdvertiseGenChatML/train.json \
 --eval_data_path data/AdvertiseGenChatML/dev.json \
 --learning_rate 1e-3 --per_device_train_batch_size 1 \
---per_device_eval_batch_size 32 --fp16\
---gradient_accumulation_steps 8 --warmup_steps 100 \
+--per_device_eval_batch_size 1 --fp16 --use_lora \
+--gradient_accumulation_steps 1 --warmup_steps 100 \
 --max_steps 3000 --weight_decay 0.01 \
 --evaluation_strategy steps --eval_steps 500 \
---save_strategy steps --save_steps 500 \
---use_lora true --seed 42 \
---log_level info --logging_strategy steps --logging_steps 10
+--save_strategy steps --save_steps 500 --seed 42 \
+--log_level info --logging_strategy steps --logging_steps 10 \
+--deepspeed configs/ds_config_zero3_offload.json

View File

@@ -1,17 +0,0 @@
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
deepspeed --include localhost:0,1 finetune.py \
--model_name_or_path <your_model_name_or_path> \
--output_dir output/AdvertiseGenLoRA/$formatted_time/ \
--train_data_path data/AdvertiseGenChatML/train.json \
--eval_data_path data/AdvertiseGenChatML/dev.json \
--learning_rate 1e-3 --per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 --fp16 --use_lora \
--gradient_accumulation_steps 8 --warmup_steps 100 \
--max_steps 3000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero2_offload.json

View File

@@ -2,16 +2,16 @@ formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
-deepspeed --include localhost:1,2 finetune.py \
+deepspeed --include localhost:1,2,4,5 finetune.py \
 --model_name_or_path <your_model_name_or_path> \
 --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
 --train_data_path data/AdvertiseGenChatML/train.json \
 --eval_data_path data/AdvertiseGenChatML/dev.json \
 --learning_rate 1e-3 --per_device_train_batch_size 1 \
---per_device_eval_batch_size 32 --fp16 \
+--per_device_eval_batch_size 4 --bf16 \
 --gradient_accumulation_steps 8 --warmup_steps 100 \
 --max_steps 3000 --weight_decay 0.01 \
 --evaluation_strategy steps --eval_steps 500 \
 --save_strategy steps --save_steps 500 --seed 42 \
 --log_level info --logging_strategy steps --logging_steps 10 \
---deepspeed configs/ds_config_zero2.json
+--deepspeed configs/ds_config_zero3_offload.json