From 8ae10c60ffc7ca07b037d8a6f0520177182144d1 Mon Sep 17 00:00:00 2001
From: root <403644786@qq.com>
Date: Fri, 21 Jun 2024 15:31:24 +0800
Subject: [PATCH] The original user/assistant token ids were hard-coded for
 the 2B model and break other models; resolve them from the tokenizer so
 they adapt to whichever model is loaded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 finetune/finetune.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/finetune/finetune.py b/finetune/finetune.py
index 7008ff2..0fe24fd 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -42,21 +42,21 @@ class TrainingArguments(transformers.TrainingArguments):
 
 class SupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
-    
+
     def __init__(
         self,
         data_path,
         tokenizer,
         model_max_length=4096,
-        user_tokens=[1786, 4194, 95388],
-        assistant_tokens=[1786, 10850, 95388],
+        user_tokens='<用户>',
+        assistant_tokens='<AI>',
     ):
         super(SupervisedDataset, self).__init__()
         self.data = json.load(open(data_path))
         self.tokenizer = tokenizer
         self.model_max_length = model_max_length
-        self.user_tokens = user_tokens
-        self.assistant_tokens = assistant_tokens
+        self.user_tokens = self.tokenizer(user_tokens)['input_ids']  # for any model, this resolves to the ids of <用户>
+        self.assistant_tokens = self.tokenizer(assistant_tokens)['input_ids']  # for any model, this resolves to the ids of <AI>
         self.ignore_index = -100
         item = self.preprocessing(self.data[0])
         print("input:", self.tokenizer.decode(item["input_ids"]))
@@ -64,7 +64,6 @@ class SupervisedDataset(Dataset):
         for id_ in item["label_ids"]:
             if id_ == -100:
                 continue
-            labels.append(id_)
         print("label:", self.tokenizer.decode(labels))
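
For reference, a minimal standalone sketch of what the change amounts to, assuming a MiniCPM-style checkpoint (the <用户>/<AI> markers and the old hard-coded ids suggest one; the exact checkpoint name below is an assumption, not taken from the patch). The patch calls the tokenizer bare; the sketch passes add_special_tokens=False because many tokenizers otherwise prepend a BOS id that would corrupt the marker sequence.

    from transformers import AutoTokenizer

    # Assumed checkpoint; substitute whichever model the script fine-tunes.
    tokenizer = AutoTokenizer.from_pretrained(
        "openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True
    )

    # Old behaviour: ids frozen against the 2B tokenizer's vocabulary.
    user_tokens_2b = [1786, 4194, 95388]        # <用户> under the 2B tokenizer
    assistant_tokens_2b = [1786, 10850, 95388]  # <AI> under the 2B tokenizer

    # New behaviour: resolve the marker strings through whichever tokenizer
    # is loaded, so the ids track the model instead of being fixed at 2B's.
    user_tokens = tokenizer("<用户>", add_special_tokens=False)["input_ids"]
    assistant_tokens = tokenizer("<AI>", add_special_tokens=False)["input_ids"]

    print(user_tokens)       # matches user_tokens_2b on the 2B tokenizer
    print(assistant_tokens)  # matches assistant_tokens_2b on the 2B tokenizer

On any other checkpoint the two lookups return that model's own ids for the marker strings, which is the point of the patch.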