diff --git a/finetune/README.md b/finetune/README.md
new file mode 100644
index 0000000..4c92758
--- /dev/null
+++ b/finetune/README.md
@@ -0,0 +1,88 @@
+# MiniCPM Fine-Tuning
+
+This directory provides fine-tuning examples for the MiniCPM-2B model, covering both full-parameter fine-tuning and PEFT. In terms of data format, it includes a multi-turn dialogue fine-tuning example and an input-output format fine-tuning example.
+
+If you have downloaded the model to a local path, replace every `OpenBMB/MiniCPM-2B` field in this document and in the code with that path so the model is loaded from disk.
+
+Running the examples requires `python>=3.10`; besides the base `torch` dependency, the example code also needs the additional dependencies listed in `requirements.txt`.
+
+**We provide a [sample notebook](lora_finetune.ipynb) that demonstrates, using AdvertiseGen as an example, how to prepare the data and use the fine-tuning scripts.**
+
+```bash
+pip install -r requirements.txt
+```
+
+## Tested Hardware
+
+For full-parameter SFT we provide single-node multi-GPU / multi-node multi-GPU examples, so you need at least one machine with multiple GPUs; the LoRA example runs on a single GPU. With the **default configuration files** in this repository, we observed the following GPU memory usage:
+
++ SFT full-parameter fine-tuning: evenly distributed across 4 GPUs, each GPU using `30245MiB` of memory.
++ LoRA fine-tuning: 1 GPU, using `10619MiB` of memory.
+
+> Note that these numbers are for reference only; memory usage may vary with different parameters. Please adjust according to your own hardware.
+
+## Multi-turn Dialogue Format
+
+The multi-turn dialogue fine-tuning example follows the ChatGLM3 conversation format convention: a different `loss_mask` is applied to each role, so the `loss` over all assistant replies in a conversation is computed in a single pass.
+
+The data files use the following format:
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "role": "system",
+        "content": ""
+      },
+      {
+        "role": "user",
+        "content": ""
+      },
+      {
+        "role": "assistant",
+        "content": ""
+      },
+      // ... Multi Turn
+      {
+        "role": "user",
+        "content": ""
+      },
+      {
+        "role": "assistant",
+        "content": ""
+      }
+    ]
+  }
+  // ...
+]
+```
+
+## Dataset Format Example
+
+Here we use the AdvertiseGen dataset as an example.
+You can download AdvertiseGen from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)
+or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1).
+Place the extracted AdvertiseGen directory under the `data` directory and convert it yourself into a dataset in the following format.
+
+> Note that the fine-tuning code now includes a validation set, so a complete set of fine-tuning data must contain both a training set and a validation set; the test set may be omitted, or the validation set can be reused in its place.
+
+```
+{"conversations": [{"role": "user", "content": "类型#裙*裙长#半身裙"}, {"role": "assistant", "content": "这款百搭时尚的仙女半身裙，整体设计非常的飘逸随性，穿上之后每个女孩子都能瞬间变成小仙女啦。料子非常的轻盈，透气性也很好，穿到夏天也很舒适。"}]}
+```
+
+## Start Fine-Tuning
+
+Run **single-node multi-GPU / multi-node multi-GPU** fine-tuning with:
+
+```bash
+cd finetune
+bash sft_finetune.sh
+```
+
+Run **single-node single-GPU** fine-tuning with:
+
+```bash
+cd finetune
+bash lora_finetune.sh
+```
diff --git a/finetune/configs/ds_config_zero2.json b/finetune/configs/ds_config_zero2.json
index b6fb07f..902ffc9 100644
--- a/finetune/configs/ds_config_zero2.json
+++ b/finetune/configs/ds_config_zero2.json
@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,10 +12,15 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
     "wall_clock_breakdown": false,
     "flops_profiler": {
         "enabled": false,
diff --git a/finetune/configs/ds_config_zero2_offload.json b/finetune/configs/ds_config_zero2_offload.json
index 8dd6fc5..09964b0 100644
--- a/finetune/configs/ds_config_zero2_offload.json
+++ b/finetune/configs/ds_config_zero2_offload.json
@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,16 +12,19 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true,
         "offload_optimizer": {
-            "device": "cpu"
-        },
-        "offload_param": {
-            "device": "cpu"
+            "device": "cpu",
+            "pin_memory": true
         }
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
"wall_clock_breakdown": false, "flops_profiler": { "enabled": false, diff --git a/finetune/configs/ds_config_zero3.json b/finetune/configs/ds_config_zero3.json index 1d091df..7e2e39a 100644 --- a/finetune/configs/ds_config_zero3.json +++ b/finetune/configs/ds_config_zero3.json @@ -1,16 +1,32 @@ { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": 1.0, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, "bf16": { "enabled": "auto" }, "zero_optimization": { "stage": 3, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "contiguous_gradients": true, "overlap_comm": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_gather_16bit_weights_on_model_save": true }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, "flops_profiler": { "enabled": false, "profile_step": 1, diff --git a/finetune/configs/ds_config_zero3_offload.json b/finetune/configs/ds_config_zero3_offload.json new file mode 100644 index 0000000..5ee9eb1 --- /dev/null +++ b/finetune/configs/ds_config_zero3_offload.json @@ -0,0 +1,46 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_gather_16bit_weights_on_model_save": true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + } + }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, + "flops_profiler": { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null + } +} \ No newline at end of file diff --git a/finetune/finetune.py b/finetune/finetune.py index 01eeb92..3d050db 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- import json -import time from typing import Dict, Optional -from datetime import datetime from dataclasses import dataclass, field import torch @@ -155,7 +153,7 @@ def load_model_and_tokenizer( model = get_peft_model(model, lora_config) # trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305 model.print_trainable_parameters() - model.enable_input_require_grads() + # model.enable_input_require_grads() # need when using adapter return model, tokenizer @@ -184,15 +182,12 @@ if __name__ == "__main__": model_max_length=training_args.model_max_length, ) - formatted_time = datetime.now().strftime("%Y%m%d%H%M%S") - trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, - # compute_metrics=compute_metrics, ) trainer.train() diff --git 
diff --git a/finetune/lora_finetune.ipynb b/finetune/lora_finetune.ipynb
index dc6e91c..6cf5b41 100644
--- a/finetune/lora_finetune.ipynb
+++ b/finetune/lora_finetune.ipynb
@@ -9,7 +9,7 @@
     "This notebook is a code example of LoRA fine-tuning MiniCPM-2B on the `AdvertiseGen` dataset so that it can generate professional advertising copy.\n",
     "\n",
     "## Hardware Requirements\n",
-    "- VRAM: 24GB\n",
+    "- VRAM: 12GB\n",
     "- GPU architecture: Ampere (recommended)\n",
     "- RAM: 16GB"
    ]
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!bash lora_finetune.sh"
+    "!bash lora_finetune_ds.sh"
    ]
   }
  ],
diff --git a/finetune/lora_finetune.sh b/finetune/lora_finetune.sh
index 86738f7..122f1a5 100644
--- a/finetune/lora_finetune.sh
+++ b/finetune/lora_finetune.sh
@@ -1,16 +1,17 @@
 formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
 
-CUDA_VISIBLE_DEVICES=0 python finetune.py \
+
+deepspeed --include localhost:0 finetune.py \
     --model_name_or_path \
     --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
     --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 32 --fp16\
-    --gradient_accumulation_steps 8 --warmup_steps 100 \
+    --per_device_eval_batch_size 1 --fp16 --use_lora \
+    --gradient_accumulation_steps 1 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
     --evaluation_strategy steps --eval_steps 500 \
-    --save_strategy steps --save_steps 500 \
-    --use_lora true --seed 42 \
-    --log_level info --logging_strategy steps --logging_steps 10
+    --save_strategy steps --save_steps 500 --seed 42 \
+    --log_level info --logging_strategy steps --logging_steps 10 \
+    --deepspeed configs/ds_config_zero3_offload.json
diff --git a/finetune/lora_finetune_ds.sh b/finetune/lora_finetune_ds.sh
deleted file mode 100644
index 5ced717..0000000
--- a/finetune/lora_finetune_ds.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-formatted_time=$(date +"%Y%m%d%H%M%S")
-echo $formatted_time
-
-
-deepspeed --include localhost:0,1 finetune.py \
-    --model_name_or_path \
-    --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
-    --train_data_path data/AdvertiseGenChatML/train.json \
-    --eval_data_path data/AdvertiseGenChatML/dev.json \
-    --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 --fp16 --use_lora \
-    --gradient_accumulation_steps 8 --warmup_steps 100 \
-    --max_steps 3000 --weight_decay 0.01 \
-    --evaluation_strategy steps --eval_steps 500 \
-    --save_strategy steps --save_steps 500 --seed 42 \
-    --log_level info --logging_strategy steps --logging_steps 10 \
-    --deepspeed configs/ds_config_zero2_offload.json
diff --git a/finetune/sft_finetune.sh b/finetune/sft_finetune.sh
index 6338da8..877cd8a 100644
--- a/finetune/sft_finetune.sh
+++ b/finetune/sft_finetune.sh
@@ -2,16 +2,16 @@
 formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
 
-deepspeed --include localhost:1,2 finetune.py \
+deepspeed --include localhost:1,2,4,5 finetune.py \
     --model_name_or_path \
     --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
     --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 32 --fp16 \
+    --per_device_eval_batch_size 4 --bf16 \
     --gradient_accumulation_steps 8 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
     --evaluation_strategy steps --eval_steps 500 \
     --save_strategy steps --save_steps 500 --seed 42 \
     --log_level info --logging_strategy steps --logging_steps 10 \
-    --deepspeed configs/ds_config_zero2.json
+    --deepspeed configs/ds_config_zero3_offload.json
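The README asks you to convert the extracted AdvertiseGen data into the `conversations` format yourself (the sample notebook also walks through data preparation). The following is a minimal conversion sketch, assuming the raw AdvertiseGen files are JSON-lines records with `content` and `summary` fields and that the output should land at the `data/AdvertiseGenChatML/train.json` / `dev.json` paths used by the shell scripts; it writes one JSON object per line, matching the dataset format example above.

```python
# Sketch: convert raw AdvertiseGen records ({"content": ..., "summary": ...}) into the
# single-turn conversations format shown in the README. Field names and paths are
# assumptions -- adjust them to your local copy and to what finetune.py expects.
import json
from pathlib import Path

def convert(src: Path, dst: Path) -> None:
    dst.parent.mkdir(parents=True, exist_ok=True)
    with src.open(encoding="utf-8") as fin, dst.open("w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            raw = json.loads(line)
            record = {
                "conversations": [
                    {"role": "user", "content": raw["content"]},
                    {"role": "assistant", "content": raw["summary"]},
                ]
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    convert(Path("data/AdvertiseGen/train.json"), Path("data/AdvertiseGenChatML/train.json"))
    convert(Path("data/AdvertiseGen/dev.json"), Path("data/AdvertiseGenChatML/dev.json"))
```

If the loader in `finetune.py` expects a single JSON array rather than JSON lines, collect the records in a list and dump them once instead.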