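"""Convert a MiniCPM checkpoint from the Hugging Face layout to the BMTrain
(CPMDragonfly) layout.

Example invocation (the script name is illustrative; paths are placeholders):

    python convert_hf_to_bmt.py --load /path/to/hf_checkpoint --save /path/to/bmt_checkpoint

The --load directory must contain config.json, pytorch_model.bin and the
tokenizer files; the converted config.json, pytorch_model.pt and copied
tokenizer files are written to the --save directory.
"""
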
import argparse
import json
import os
import shutil
from collections import OrderedDict

import torch
from tqdm import tqdm


def convert_model(config, ckpt):
    # Build the CPMDragonfly (BMTrain) config from the Hugging Face config.
    config_bmt = OrderedDict(
        {
            "_dtype": "bf16",
            "activate_fn": "silu",
            "architectures": [
                "CPMDragonflyForCausalLM"
            ],
            "model_type": "cpm_dragonfly",
            "base": 10000,
            "dim_ff": config['intermediate_size'],
            "dim_head": config['hidden_size'] // config['num_attention_heads'],
            "dim_model": config['hidden_size'],
            "dim_model_base": 256,
            "dropout_p": 0.0,
            "eps": config['rms_norm_eps'],
            "init_std": config['initializer_range'],
            "num_heads": config['num_attention_heads'],
            "num_kv_heads": config['num_key_value_heads'],
            "num_layers": config['num_hidden_layers'],
            "orig_max_length": 4096,
            "pose_prob": 0.0,
            "pose_scaling_factor": 1.0,
            "qk_norm": False,
            "rope_scaling_factor": 1,
            "rope_scaling_type": "",
            "scale": True,
            "scale_depth": config['scale_depth'],
            "scale_emb": config['scale_emb'],
            "tie_lm_head": True,
            "tp": 0,
            "transformers_version": "4.35.0",
            "vocab_size": config['vocab_size']
        }
    )
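    # Entries with no counterpart in the Hugging Face config (e.g. base, dim_model_base,
    # orig_max_length) are hard-coded above; scale_emb and scale_depth are MiniCPM's
    # embedding and residual scaling hyperparameters, passed through from the HF config.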

    model_bmt = OrderedDict()
    model_bmt["input_embedding.weight"] = ckpt['model.embed_tokens.weight'].contiguous()
    model_bmt["encoder.output_layernorm.weight"] = ckpt['model.norm.weight'].contiguous()
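    # Per-layer renaming from Hugging Face parameter names to BMTrain names, e.g.
    # "model.layers.0.self_attn.q_proj.weight" -> "encoder.layers.0.self_att.self_attention.project_q.weight".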
    for lnum in tqdm(range(config_bmt['num_layers'])):
        hf_pfx = f"model.layers.{lnum}"
        bmt_pfx = f"encoder.layers.{lnum}"

        model_bmt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = ckpt[f"{hf_pfx}.input_layernorm.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = ckpt[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = ckpt[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = ckpt[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = ckpt[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous()

        model_bmt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = ckpt[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = ckpt[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = ckpt[f"{hf_pfx}.mlp.up_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = ckpt[f"{hf_pfx}.mlp.down_proj.weight"].contiguous()

    return config_bmt, model_bmt


def load_model_ckpt(args):
    with open(os.path.join(args.load, "config.json"), 'r') as fin:
        config = json.load(fin)
    # Load on CPU so the conversion does not require a GPU.
    ckpt = torch.load(os.path.join(args.load, "pytorch_model.bin"), map_location="cpu")

    os.makedirs(f"{args.save}", exist_ok=True)

    # Convert and save the config and model weights in BMTrain format.
    bmt_config, bmt_ckpt = convert_model(config, ckpt)
    with open(os.path.join(args.save, "config.json"), 'w') as fout:
        json.dump(bmt_config, fout, indent=4)
    torch.save(bmt_ckpt, f"{args.save}/pytorch_model.pt")

    # Copy the tokenizer files unchanged.
    shutil.copyfile(f"{args.load}/tokenizer.json", f"{args.save}/tokenizer.json")
    shutil.copyfile(f"{args.load}/tokenizer.model", f"{args.save}/tokenizer.model")
    shutil.copyfile(f"{args.load}/special_tokens_map.json", f"{args.save}/special_tokens_map.json")
    shutil.copyfile(f"{args.load}/tokenizer_config.json", f"{args.save}/tokenizer_config.json")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--load", type=str, default="", help="directory of the Hugging Face checkpoint to convert")
    parser.add_argument("--save", type=str, default="", help="output directory for the BMTrain-format checkpoint")
    args = parser.parse_args()

    load_model_ckpt(args)
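
# A minimal sanity check of the converted checkpoint could look like the sketch
# below (the path is a placeholder):
#
#     state = torch.load("/path/to/save_dir/pytorch_model.pt", map_location="cpu")
#     assert "input_embedding.weight" in state
#     assert "encoder.layers.0.self_att.self_attention.project_q.weight" in state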