mirror of
https://github.com/RYDE-WORK/MiniCPM.git
synced 2026-02-05 06:33:25 +08:00
使用encode方法,代码可读性更强
This commit is contained in:
parent
8ae10c60ff
commit
062ea5264a
@@ -55,8 +55,8 @@ class SupervisedDataset(Dataset):
|
|||||||
self.data = json.load(open(data_path))
|
self.data = json.load(open(data_path))
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.model_max_length = model_max_length
|
self.model_max_length = model_max_length
|
||||||
self.user_tokens = self.tokenizer(user_tokens)['input_ids']#针对不同模型,都可以对应到<用户>的id
|
self.user_tokens = self.tokenizer.encode(user_tokens)#针对不同模型,都可以对应到<用户>的id
|
||||||
self.assistant_tokens = self.tokenizer(assistant_tokens)['input_ids']#针对不同模型,都可以对应到<AI>的id
|
self.assistant_tokens = self.tokenizer.encode(assistant_tokens)#针对不同模型,都可以对应到<AI>的id
|
||||||
self.ignore_index = -100
|
self.ignore_index = -100
|
||||||
item = self.preprocessing(self.data[0])
|
item = self.preprocessing(self.data[0])
|
||||||
print("input:", self.tokenizer.decode(item["input_ids"]))
|
print("input:", self.tokenizer.decode(item["input_ids"]))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user