diff --git a/finetune/README.md b/finetune/README.md
new file mode 100644
index 0000000..4c92758
--- /dev/null
+++ b/finetune/README.md
@@ -0,0 +1,88 @@
+# MiniCPM Fine-Tuning
+
+This directory provides fine-tuning examples for the MiniCPM-2B model, covering both full-parameter fine-tuning and PEFT. In terms of data format, it includes a multi-turn dialogue fine-tuning example and an input-output format fine-tuning example.
+
+If you have downloaded the model to a local path, replace every `OpenBMB/MiniCPM-2B` field in this document and in the code with that path so the model is loaded from disk.
+
+Running the examples requires `python>=3.10`; besides the base `torch` dependency, the example code also needs the additional dependencies listed in `requirements.txt`.
+
+**We provide a [sample notebook](lora_finetune.ipynb) that demonstrates, using AdvertiseGen as an example, how to prepare the data and use the fine-tuning scripts.**
+
+```bash
+pip install -r requirements.txt
+```
+
+## Tested Hardware
+
+For full-parameter SFT we provide single-node multi-GPU / multi-node multi-GPU examples, so you need at least one machine with multiple GPUs; the LoRA example runs on a single GPU. With the **default configuration files** in this repository, we observed the following GPU memory usage:
+
++ SFT full-parameter fine-tuning: evenly distributed across 4 GPUs, each GPU using `30245MiB` of memory.
++ LoRA fine-tuning: 1 GPU, using `10619MiB` of memory.
+
+> Note that these numbers are for reference only; memory usage may vary with different parameters. Please adjust according to your own hardware.
+
+## Multi-turn Dialogue Format
+
+The multi-turn dialogue fine-tuning example follows the ChatGLM3 conversation format convention: a different `loss_mask` is applied to each role, so the `loss` over all assistant replies in a conversation is computed in a single pass.
+
+The data files use the following format:
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "role": "system",
+        "content": ""
+      },
+      {
+        "role": "user",
+        "content": ""
+      },
+      {
+        "role": "assistant",
+        "content": ""
+      },
+      // ... Multi Turn
+      {
+        "role": "user",
+        "content": ""
+      },
+      {
+        "role": "assistant",
+        "content": ""
+      }
+    ]
+  }
+  // ...
+]
+```
+
+## Dataset Format Example
+
+Here we use the AdvertiseGen dataset as an example.
+You can download AdvertiseGen from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)
+or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1).
+Place the extracted AdvertiseGen directory under the `data` directory and convert it yourself into a dataset in the following format.
+
+> Note that the fine-tuning code now includes a validation set, so a complete set of fine-tuning data must contain both a training set and a validation set; the test set may be omitted, or the validation set can be reused in its place.
+
+```
+{"conversations": [{"role": "user", "content": "类型#裙*裙长#半身裙"}, {"role": "assistant", "content": "这款百搭时尚的仙女半身裙，整体设计非常的飘逸随性，穿上之后每个女孩子都能瞬间变成小仙女啦。料子非常的轻盈，透气性也很好，穿到夏天也很舒适。"}]}
+```
+
+## Start Fine-Tuning
+
+Run **single-node multi-GPU / multi-node multi-GPU** fine-tuning with:
+
+```bash
+cd finetune
+bash sft_finetune.sh
+```
+
+Run **single-node single-GPU** fine-tuning with:
+
+```bash
+cd finetune
+bash lora_finetune.sh
+```
diff --git a/finetune/configs/ds_config_zero2.json b/finetune/configs/ds_config_zero2.json
index b6fb07f..902ffc9 100644
--- a/finetune/configs/ds_config_zero2.json
+++ b/finetune/configs/ds_config_zero2.json
@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,10 +12,15 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
     "wall_clock_breakdown": false,
     "flops_profiler": {
         "enabled": false,
diff --git a/finetune/configs/ds_config_zero2_offload.json b/finetune/configs/ds_config_zero2_offload.json
index 8dd6fc5..09964b0 100644
--- a/finetune/configs/ds_config_zero2_offload.json
+++ b/finetune/configs/ds_config_zero2_offload.json
@@ -1,8 +1,4 @@
 {
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": 1.0,
     "fp16": {
         "enabled": "auto",
         "loss_scale": 0,
@@ -16,16 +12,19 @@
     },
     "zero_optimization": {
         "stage": 2,
+        "allgather_partitions": true,
         "overlap_comm": true,
         "reduce_scatter": true,
         "contiguous_gradients": true,
         "offload_optimizer": {
-            "device": "cpu"
-        },
-        "offload_param": {
-            "device": "cpu"
+            "device": "cpu",
+            "pin_memory": true
         }
     },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
"wall_clock_breakdown": false, "flops_profiler": { "enabled": false, diff --git a/finetune/configs/ds_config_zero3.json b/finetune/configs/ds_config_zero3.json index 1d091df..7e2e39a 100644 --- a/finetune/configs/ds_config_zero3.json +++ b/finetune/configs/ds_config_zero3.json @@ -1,16 +1,32 @@ { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": 1.0, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, "bf16": { "enabled": "auto" }, "zero_optimization": { "stage": 3, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "contiguous_gradients": true, "overlap_comm": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_gather_16bit_weights_on_model_save": true }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, "flops_profiler": { "enabled": false, "profile_step": 1, diff --git a/finetune/configs/ds_config_zero3_offload.json b/finetune/configs/ds_config_zero3_offload.json new file mode 100644 index 0000000..5ee9eb1 --- /dev/null +++ b/finetune/configs/ds_config_zero3_offload.json @@ -0,0 +1,46 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_gather_16bit_weights_on_model_save": true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + } + }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, + "flops_profiler": { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null + } +} \ No newline at end of file diff --git a/finetune/finetune.py b/finetune/finetune.py index 01eeb92..3d050db 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- import json -import time from typing import Dict, Optional -from datetime import datetime from dataclasses import dataclass, field import torch @@ -155,7 +153,7 @@ def load_model_and_tokenizer( model = get_peft_model(model, lora_config) # trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305 model.print_trainable_parameters() - model.enable_input_require_grads() + # model.enable_input_require_grads() # need when using adapter return model, tokenizer @@ -184,15 +182,12 @@ if __name__ == "__main__": model_max_length=training_args.model_max_length, ) - formatted_time = datetime.now().strftime("%Y%m%d%H%M%S") - trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, - # compute_metrics=compute_metrics, ) trainer.train() diff --git 
diff --git a/finetune/lora_finetune.ipynb b/finetune/lora_finetune.ipynb
index dc6e91c..6cf5b41 100644
--- a/finetune/lora_finetune.ipynb
+++ b/finetune/lora_finetune.ipynb
@@ -9,7 +9,7 @@
     "This notebook is a code example of LoRA fine-tuning MiniCPM-2B on the `AdvertiseGen` dataset so that it can generate professional advertising copy.\n",
     "\n",
     "## Hardware Requirements\n",
-    "- VRAM: 24GB\n",
+    "- VRAM: 12GB\n",
     "- GPU architecture: Ampere (recommended)\n",
     "- RAM: 16GB"
    ]
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!bash lora_finetune.sh"
+    "!bash lora_finetune_ds.sh"
    ]
   }
  ],
diff --git a/finetune/lora_finetune.sh b/finetune/lora_finetune.sh
index 86738f7..122f1a5 100644
--- a/finetune/lora_finetune.sh
+++ b/finetune/lora_finetune.sh
@@ -1,16 +1,17 @@
 formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
 
-CUDA_VISIBLE_DEVICES=0 python finetune.py \
+
+deepspeed --include localhost:0 finetune.py \
     --model_name_or_path \
     --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
     --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 32 --fp16\
-    --gradient_accumulation_steps 8 --warmup_steps 100 \
+    --per_device_eval_batch_size 1 --fp16 --use_lora \
+    --gradient_accumulation_steps 1 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
     --evaluation_strategy steps --eval_steps 500 \
-    --save_strategy steps --save_steps 500 \
-    --use_lora true --seed 42 \
-    --log_level info --logging_strategy steps --logging_steps 10
+    --save_strategy steps --save_steps 500 --seed 42 \
+    --log_level info --logging_strategy steps --logging_steps 10 \
+    --deepspeed configs/ds_config_zero3_offload.json
diff --git a/finetune/lora_finetune_ds.sh b/finetune/lora_finetune_ds.sh
deleted file mode 100644
index 5ced717..0000000
--- a/finetune/lora_finetune_ds.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-formatted_time=$(date +"%Y%m%d%H%M%S")
-echo $formatted_time
-
-
-deepspeed --include localhost:0,1 finetune.py \
-    --model_name_or_path \
-    --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
-    --train_data_path data/AdvertiseGenChatML/train.json \
-    --eval_data_path data/AdvertiseGenChatML/dev.json \
-    --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 --fp16 --use_lora \
-    --gradient_accumulation_steps 8 --warmup_steps 100 \
-    --max_steps 3000 --weight_decay 0.01 \
-    --evaluation_strategy steps --eval_steps 500 \
-    --save_strategy steps --save_steps 500 --seed 42 \
-    --log_level info --logging_strategy steps --logging_steps 10 \
-    --deepspeed configs/ds_config_zero2_offload.json
diff --git a/finetune/sft_finetune.sh b/finetune/sft_finetune.sh
index 6338da8..877cd8a 100644
--- a/finetune/sft_finetune.sh
+++ b/finetune/sft_finetune.sh
@@ -2,16 +2,16 @@
 formatted_time=$(date +"%Y%m%d%H%M%S")
 echo $formatted_time
 
-deepspeed --include localhost:1,2 finetune.py \
+deepspeed --include localhost:1,2,4,5 finetune.py \
     --model_name_or_path \
     --output_dir output/AdvertiseGenLoRA/$formatted_time/ \
     --train_data_path data/AdvertiseGenChatML/train.json \
     --eval_data_path data/AdvertiseGenChatML/dev.json \
     --learning_rate 1e-3 --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 32 --fp16 \
+    --per_device_eval_batch_size 4 --bf16 \
     --gradient_accumulation_steps 8 --warmup_steps 100 \
     --max_steps 3000 --weight_decay 0.01 \
     --evaluation_strategy steps --eval_steps 500 \
     --save_strategy steps --save_steps 500 --seed 42 \
     --log_level info --logging_strategy steps --logging_steps 10 \
-    --deepspeed configs/ds_config_zero2.json
+    --deepspeed configs/ds_config_zero3_offload.json
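The README asks you to convert the extracted AdvertiseGen data into the `conversations` format yourself (the sample notebook also walks through data preparation). The following is a minimal conversion sketch, assuming the raw AdvertiseGen files are JSON-lines records with `content` and `summary` fields and that the output should land at the `data/AdvertiseGenChatML/train.json` / `dev.json` paths used by the shell scripts; it writes one JSON object per line, matching the dataset format example above.

```python
# Sketch: convert raw AdvertiseGen records ({"content": ..., "summary": ...}) into the
# single-turn conversations format shown in the README. Field names and paths are
# assumptions -- adjust them to your local copy and to what finetune.py expects.
import json
from pathlib import Path

def convert(src: Path, dst: Path) -> None:
    dst.parent.mkdir(parents=True, exist_ok=True)
    with src.open(encoding="utf-8") as fin, dst.open("w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            raw = json.loads(line)
            record = {
                "conversations": [
                    {"role": "user", "content": raw["content"]},
                    {"role": "assistant", "content": raw["summary"]},
                ]
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    convert(Path("data/AdvertiseGen/train.json"), Path("data/AdvertiseGenChatML/train.json"))
    convert(Path("data/AdvertiseGen/dev.json"), Path("data/AdvertiseGenChatML/dev.json"))
```

If the loader in `finetune.py` expects a single JSON array rather than JSON lines, collect the records in a list and dump them once instead.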