Fix finetune supervised dataset issue

Xiang Long 2024-03-16 01:58:08 +08:00
parent 74ecbcce5e
commit 36337f70ea
53 changed files with 170463 additions and 54 deletions

.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
+*__pycache__*
+*.pyc
+finetune/output/*
+wip.*

View File

@@ -64,11 +64,6 @@ pip install -r requirements.txt
 ## 数据集格式示例
-这里以 AdvertiseGen 数据集为例,
-您可以从 [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)
-或者 [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) 下载 AdvertiseGen 数据集。
-将解压后的 AdvertiseGen 目录放到 `data` 目录下并自行转换为如下格式数据集。
 > 请注意,现在的微调代码中加入了验证集,因此,对于一组完整的微调数据集,必须包含训练数据集和验证数据集,测试数据集可以不填写。或者直接用验证数据集代替。
 ```

View File

@@ -66,11 +66,6 @@ For the data file, the example uses the following format
 ## Dataset Format Example
-Here, taking the AdvertiseGen dataset as an example,
-you can download the AdvertiseGen dataset from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)
-or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) . After extracting the AdvertiseGen directory, place it in the `data` directory and convert it into the following format dataset.
 > Please note, the fine-tuning code now includes a validation set, so for a complete set of fine-tuning datasets, it must contain training and validation datasets, while the test dataset is optional. Or, you can use the validation dataset in place of it.
 ```
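
The large suppressed diffs below presumably hold the converted data files referenced by the scripts (`data/AdvertiseGenChatML/train.json`, `data/AdvertiseGenChatML/dev.json`). As a rough sketch of the conversion step the READMEs now ask the user to do themselves: `SupervisedDataset` in `finetune.py` reads `example["messages"]` with per-message `role`/`content`, so each record must carry a `messages` list. The role names `user`/`assistant`, the raw AdvertiseGen field names `content`/`summary`, and the choice of JSON-lines input vs. a single JSON array output are assumptions here, not something this commit shows:

```python
import json

def convert(src_path: str, dst_path: str) -> None:
    """Hypothetical converter from raw AdvertiseGen records to the
    messages-based format consumed by SupervisedDataset in finetune.py."""
    records = []
    with open(src_path, encoding="utf-8") as f:
        for line in f:  # raw AdvertiseGen split assumed to be JSON lines
            raw = json.loads(line)
            records.append({
                "messages": [
                    {"role": "user", "content": raw["content"]},      # assumed field name
                    {"role": "assistant", "content": raw["summary"]}, # assumed field name
                ]
            })
    with open(dst_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)  # assumed: one JSON array per split

convert("data/AdvertiseGen/train.json", "data/AdvertiseGenChatML/train.json")
convert("data/AdvertiseGen/dev.json", "data/AdvertiseGenChatML/dev.json")
```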

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -73,7 +73,7 @@ class SupervisedDataset(Dataset):
     def preprocessing(self, example):
         input_ids = [self.tokenizer.bos_token_id]
-        label_ids = []
+        label_ids = [self.ignore_index]
         for message in example["messages"]:
             role = message["role"]
@@ -92,17 +92,22 @@ class SupervisedDataset(Dataset):
                 + content_ids
             )
+        input_ids.append(self.tokenizer.eos_token_id)
+        label_ids.append(self.tokenizer.eos_token_id)
+        # truncate to max len
         input_ids = input_ids[: self.model_max_length]
         label_ids = label_ids[: self.model_max_length]
-        # input_ids += [self.tokenizer.eos_token_id] * (len(label_ids) - len(input_ids))
+        attention_mask = [1] * len(input_ids)
+        # pad to max len
         input_ids += [self.tokenizer.eos_token_id] * (
             self.model_max_length - len(input_ids)
         )
         label_ids += [self.ignore_index] * (self.model_max_length - len(label_ids))
+        attention_mask += [0] * (self.model_max_length - len(attention_mask))
+        # convert to pt tensor
         input_ids = torch.LongTensor(input_ids)
         label_ids = torch.LongTensor(label_ids)
-        # print(f"len input_ids: {len(input_ids)}, len label_ids: {len(label_ids)}")
-        attention_mask = input_ids.ne(self.tokenizer.eos_token_id)
+        attention_mask = torch.LongTensor(attention_mask)
         return {
             "input_ids": input_ids,
             "label_ids": label_ids,
@@ -158,7 +163,6 @@ def load_model_and_tokenizer(
 if __name__ == "__main__":
     model_path = "/mnt/data/user/tc_agi/yh/models/MiniCPM"
-    max_length = 512
     parser = transformers.HfArgumentParser(
         (ModelArguments, DataArguments, TrainingArguments)
     )
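
This hunk is the substance of the commit: an EOS token is now appended to both `input_ids` and `label_ids`, the attention mask is built from the real (pre-padding) sequence length instead of being derived from `input_ids.ne(eos_token_id)`, and the label sequence starts with `ignore_index` so the BOS position does not contribute to the loss. Below is a minimal standalone sketch of just that padding/masking step, assuming `ignore_index` is -100 (PyTorch's `CrossEntropyLoss` default; the actual value set elsewhere in `finetune.py` is not shown in this hunk) and that `input_ids`/`label_ids` arrive already tokenized from the chat messages:

```python
import torch

IGNORE_INDEX = -100  # assumption: label value ignored by the loss (CrossEntropyLoss default)

def pad_and_mask(input_ids, label_ids, eos_token_id, model_max_length):
    # Append EOS so the model learns to terminate, on inputs and labels alike.
    input_ids = input_ids + [eos_token_id]
    label_ids = label_ids + [eos_token_id]
    # Truncate to the maximum length.
    input_ids = input_ids[:model_max_length]
    label_ids = label_ids[:model_max_length]
    # Mark real tokens before any padding is appended.
    attention_mask = [1] * len(input_ids)
    # Pad out to model_max_length; padded label positions are ignored by the loss.
    input_ids += [eos_token_id] * (model_max_length - len(input_ids))
    label_ids += [IGNORE_INDEX] * (model_max_length - len(label_ids))
    attention_mask += [0] * (model_max_length - len(attention_mask))
    return {
        "input_ids": torch.LongTensor(input_ids),
        "label_ids": torch.LongTensor(label_ids),
        "attention_mask": torch.LongTensor(attention_mask),
    }

# Toy usage with made-up token ids (bos=1, eos=2):
batch = pad_and_mask([1, 11, 12, 13], [IGNORE_INDEX, IGNORE_INDEX, 12, 13],
                     eos_token_id=2, model_max_length=8)
print(batch["attention_mask"])  # tensor([1, 1, 1, 1, 1, 0, 0, 0])
```

With the fix, an EOS token is part of the real sequence; since EOS doubles as the padding token here, deriving the mask from `ne(eos_token_id)` would zero out the genuine end-of-sequence position, hence the explicit mask built before padding. Presumably the missing EOS supervision is the "supervised dataset issue" the commit title refers to.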

View File

@@ -4,11 +4,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# MiniCPM-2B 参数高效微调(LoRA)消费级单卡示例\n",
+"# MiniCPM-2B 参数高效微调(LoRA)A100 80G 单卡示例\n",
+"\n",
+"显存更小的显卡可用 batch size 和 grad_accum 间时间换空间\n",
 "\n",
 "本 notebook 是一个使用 `AdvertiseGen` 数据集对 MiniCPM-2B 进行 LoRA 微调,使其具备专业的广告生成能力的代码示例。\n",
 "\n",
-"## 硬件需求\n",
+"## 最低硬件需求\n",
 "- 显存:12GB\n",
 "- 显卡架构:安培架构(推荐)\n",
 "- 内存:16GB"
@@ -20,31 +22,7 @@
 "source": [
 "## 1. 准备数据集\n",
 "\n",
-"下载 AdvertiseGen 数据集\n",
-"- [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)\n",
-"- [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1)\n",
-"\n",
-"下载后的数据集格式为 `.tar.gz` 的压缩格式,接下来的操作中,假设该压缩包被置于 `finetune/data/`。\n"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# 校验文件完整性\n",
-"!md5sum data/AdvertiseGen.tar.gz "
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# 解压数据集\n",
-"!tar xvf data/AdvertiseGen.tar.gz "
+"将数据集转换为更通用的格式\n"
 ]
 },
 {
@@ -103,6 +81,47 @@
 "source": [
 "!bash lora_finetune.sh"
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## 推理验证"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"import torch\n",
+"from tqdm import tqdm\n",
+"from transformers import AutoModelForCausalLM, AutoTokenizer"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"path = \"output/AdvertiseGenLoRA/20240315224356/checkpoint-3000\"\n",
+"tokenizer = AutoTokenizer.from_pretrained(path)\n",
+"model = AutoModelForCausalLM.from_pretrained(\n",
+" path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"res, history = model.chat(tokenizer, query=\"<用户>类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞<AI>\", max_length=80, top_p=0.5)\n",
+"res, history"
+]
 }
 ],
 "metadata": {
@@ -121,7 +140,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.13"
+"version": "3.10.12"
 }
 },
 "nbformat": 4,

View File

@@ -2,13 +2,13 @@ formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
-deepspeed --include localhost:0 finetune.py \
-    --model_name_or_path <your_model_name_or_path> \
+deepspeed --include localhost:1 finetune.py \
+    --model_name_or_path MiniCPM-2B-sft-bf16 \
     --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
-    --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 --fp16 --use_lora \
+    --learning_rate 5e-5 --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 64 --model_max_length 384 --bf16 --use_lora \
     --gradient_accumulation_steps 1 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
     --evaluation_strategy steps --eval_steps 500 \

File diff suppressed because it is too large

View File

@@ -0,0 +1,17 @@
+formatted_time=$(date +"%Y%m%d%H%M%S")
+echo $formatted_time
+deepspeed --include localhost:1 --master_port 19888 finetune.py \
+    --model_name_or_path MiniCPM-2B-sft-bf16 \
+    --output_dir output/OCNLILoRA/$formatted_time/ \
+    --train_data_path data/ocnli_public_chatml/train.json \
+    --eval_data_path data/ocnli_public_chatml/dev.json \
+    --learning_rate 5e-5 --per_device_train_batch_size 80 \
+    --per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora \
+    --gradient_accumulation_steps 1 --warmup_steps 100 \
+    --max_steps 1000 --weight_decay 0.01 \
+    --evaluation_strategy steps --eval_steps 500 \
+    --save_strategy steps --save_steps 500 --seed 42 \
+    --log_level info --logging_strategy steps --logging_steps 10 \
+    --deepspeed configs/ds_config_zero3_offload.json

View File

@@ -2,16 +2,16 @@ formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
-deepspeed --include localhost:0,1,2,3 finetune.py \
-    --model_name_or_path <your_model_name_or_path> \
+deepspeed --include localhost:0,1 finetune.py \
+    --model_name_or_path MiniCPM-2B-sft-bf16 \
     --output_dir output/AdvertiseGenSFT/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
-    --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 4 --bf16 \
-    --gradient_accumulation_steps 8 --warmup_steps 100 \
+    --learning_rate 5e-5 --per_device_train_batch_size 14 \
+    --per_device_eval_batch_size 32 --bf16 \
+    --gradient_accumulation_steps 2 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
-    --evaluation_strategy steps --eval_steps 500 \
+    --evaluation_strategy steps --eval_steps 100 \
     --save_strategy steps --save_steps 500 --seed 42 \
     --log_level info --logging_strategy steps --logging_steps 10 \
-    --deepspeed configs/ds_config_zero3_offload.json
+    --deepspeed configs/ds_config_zero2.json
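
The hyperparameter changes across these scripts move in the same direction: a smaller learning rate, larger per-device batches, and bf16. For readers adapting them to other hardware, the quantity to hold roughly constant is the effective optimizer batch size, i.e. the product of per-device batch size, gradient accumulation steps, and the number of GPUs listed in `--include`. A quick sanity check with the values from the updated scripts (the helper is illustrative, not part of the repo):

```python
def effective_batch_size(per_device_batch: int, grad_accum_steps: int, num_gpus: int) -> int:
    # Global number of samples contributing to each optimizer update.
    return per_device_batch * grad_accum_steps * num_gpus

# lora_finetune.sh: one GPU (localhost:1), batch 32, no accumulation -> 32
print(effective_batch_size(32, 1, 1))
# sft_finetune.sh: two GPUs (localhost:0,1), batch 14, accumulation 2 -> 56
print(effective_batch_size(14, 2, 2))
```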