How do you fine-tune a model on custom data with the huggingface transformers pipeline?

Replies


from datasets import Dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, pipeline, set_seed)

# Load the GPT-2 tokenizer and model. Note: a transformers pipeline is an
# inference-only wrapper with no train() method, so fine-tuning goes through
# the Trainer API instead.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the fine-tuning data: each example is a full target text
train_texts = [
    "This is the first sentence. This is the second sentence. ",
    "Another example. This is another sentence. ",
]
train_dataset = Dataset.from_dict({"text": train_texts})
train_dataset = train_dataset.map(lambda ex: tokenizer(ex["text"], truncation=True), batched=True)

# Fine-tune the model
set_seed(42)
training_args = TrainingArguments(output_dir="./gpt2-finetuned", learning_rate=1e-3, num_train_epochs=1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()

# Generate text with the fine-tuned model
nlp_textgen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
text = nlp_textgen("Hello, how are you?")
print(text)

In the code above, we first load the GPT-2 tokenizer and model. A transformers pipeline is an inference-only wrapper and has no train() method, so the fine-tuning itself goes through the Trainer API. We define the fine-tuning data train_texts with two examples, wrap it in a datasets.Dataset, and tokenize it. We call set_seed to make the run reproducible, then build a Trainer with a causal-language-modeling data collator and a few training arguments (learning rate and number of epochs) and call trainer.train(). Once fine-tuning finishes, the model can be wrapped in a text-generation pipeline: here we generate a continuation of "Hello, how are you?", store it in the variable text, and print it.
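
To reuse the fine-tuned model later without retraining, you can persist it to disk and point pipeline at the saved directory. A minimal sketch, assuming the ./gpt2-finetuned directory chosen as output_dir above:

# Save the fine-tuned weights and tokenizer (the directory name is illustrative)
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# Later, reload straight into a pipeline from the saved directory
nlp = pipeline("text-generation", model="./gpt2-finetuned")
print(nlp("Hello, how are you?", max_new_tokens=20))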

1. Training reference doc: https://huggingface.co/docs/transformers/v4.21.2/en/training

train.py

from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import json
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"  # restrict training to these GPUs

# Load the dataset (train and test splits)
dataset = load_dataset("csv", data_files={"train": "./weibo_train.csv", "test": "./weibo_test.csv"}, cache_dir="./cache")
dataset = dataset.class_encode_column("label")  # encode the label column as a ClassLabel, collecting the label set from the training data

# Build a label-to-id mapping from the loaded dataset, used for training and for later inference and accuracy computation
def generate_label_map(dataset):
    labels = dataset['train'].features['label'].names
    label2id = dict()
    for idx, label in enumerate(labels):
        label2id[label] = idx
    return label2id

def save_label_map(dataset, label_map_file):
    # Only the labels of the training data form the label set of the model.
    label2id = generate_label_map(dataset)
    with open(label_map_file, 'w', encoding='utf-8') as fout:
        json.dump(label2id, fout)

# Save the label map
label_map_file = 'label2id.json'
save_label_map(dataset, label_map_file)

# Read the label map back from disk (note: with multi-GPU training, reading the file like this can race and raise errors)
#label2id={}
#with open(label_map_file,'r',encoding='utf-8') as fin:
#    label2id=json.load(fin)

label2id = generate_label_map(dataset)

if not label2id:
    exit()
id2label = {id: label for label, id in label2id.items()}

# Load the tokenizer (downloaded automatically if not cached locally)
tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")


# Tokenize the text field, truncating to at most 45 tokens
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=45)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

#small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

config = AutoConfig.from_pretrained("./bert-base-chinese", num_labels=len(label2id), id2label=id2label, label2id=label2id)
# Load the pretrained BERT model with a classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained("./bert-base-chinese", config=config)
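
# The Trainer below expects a compute_metrics function that the original script
# references but never defines. A minimal accuracy implementation (an assumption;
# any function taking the (logits, labels) eval tuple would work):
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}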


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=180,
    per_device_eval_batch_size=128,
    num_train_epochs=20,
    weight_decay=0.01,
    #fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("training sample: ", trainer.train_dataset[1])

trainer.train()

print("finished")
2. Inference reference doc: https://huggingface.co/docs/transformers/v4.21.2/en/pipeline_tutorial

infer.py

from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset
import datasets
from sklearn.metrics import accuracy_score


# Load the test data
#dataset = load_dataset("csv", data_files={"train": "", "test": "./weibo_test.csv"}, split='test')
dataset = load_dataset("csv", data_files={"train": "weibo_train.csv", "test": "weibo_test.csv"}, cache_dir="./cache")

# Load the fine-tuned model from a training checkpoint
model_dir = './results/checkpoint-1200'
print('using checkpoint from dir:', model_dir)
pipe = pipeline(task="text-classification", device=0, model=model_dir)

# Run batched predictions over the test split
preds = []
for out in pipe(KeyDataset(dataset['test'], "text"), batch_size=128, truncation="only_first"):
    print(out)
    #print(out['label'])
    preds.append(out['label'])

'''
with open('pred.txt','w',encoding='utf8') as fout:
    for label in preds:
        fout.write(label)
        fout.write('\n')
'''

# Compute accuracy; the CSV 'label' column must hold the same label strings the pipeline emits
y_true = dataset['test']['label']
acc = accuracy_score(y_true, preds)
print('Acc on test data: {:.4f}'.format(acc))
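
Plain accuracy can hide per-class problems when the label distribution is skewed; sklearn's classification_report gives a per-label precision/recall/F1 breakdown from the same predictions. A small sketch reusing y_true and preds from above:

from sklearn.metrics import classification_report

# Per-class precision, recall, and F1 over the test predictions
print(classification_report(y_true, preds))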
