import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
# === Configuration ===
# Pick one: Qwen2.5, DeepSeek-R1-Distill, Llama 3.1, Mistral, etc.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
# MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
DATASET = "your_dataset.json"  # or a Hugging Face dataset name
OUTPUT_DIR = "./output"
MAX_SEQ_LENGTH = 4096  # Qwen2.5 natively supports context up to 32K
USE_DORA = True  # DoRA improves quality over standard LoRA
USE_FLASH_ATTN = True  # Flash Attention 2 saves VRAM and speeds up training
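# Optional guard (a sketch, not part of the original flow): Flash Attention 2 requires an
# Ampere-or-newer GPU (compute capability >= 8.0), so fall back to eager attention otherwise.
if USE_FLASH_ATTN and torch.cuda.is_available():
    if torch.cuda.get_device_capability(0)[0] < 8:
        USE_FLASH_ATTN = False  # e.g. T4/V100 cannot run flash_attention_2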
# === Load the model with 4-bit quantization ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
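# Rough memory math (sketch): NF4 stores weights at ~0.5 byte/param, so a 7B base model's
# frozen weights take roughly 3.5 GB of VRAM; double quantization trims a few hundred MB
# more by also quantizing the quantization constants (per the QLoRA paper).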
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # only strictly needed for repos shipping custom code; harmless here
    # Flash Attention 2 requires an Ampere-or-newer GPU (RTX 30/40, A100)
    attn_implementation="flash_attention_2" if USE_FLASH_ATTN else "eager",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
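# Sanity check (sketch): render a dummy turn through the chat template to confirm the
# tokenizer emits ChatML markers (<|im_start|> / <|im_end|>), as Qwen2.5 expects.
_demo = tokenizer.apply_chat_template(
    [{"role": "user", "content": "ping"}],
    tokenize=False,
    add_generation_prompt=True,
)
if "<|im_start|>" not in _demo:
    print("Note: this tokenizer does not use ChatML markers; adjust the formatting below.")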
# === Configure LoRA (optionally DoRA) ===
# DoRA (Weight-Decomposed Low-Rank Adaptation) -- requires PEFT >= 0.14
# use_dora=True decomposes each weight into magnitude + direction for better quality
lora_config = LoraConfig(
    r=64,            # rank (higher = more capacity, more VRAM)
    lora_alpha=16,   # scaling factor (effective scale = alpha / r; QLoRA pairs alpha=16 with r=64)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention layers
        "gate_proj", "up_proj", "down_proj",     # MLP layers
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=USE_DORA,  # DoRA: quality boost (PEFT 0.14+)
    # use_rslora=True,  # optional: rank-stabilized LoRA
)
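# Back-of-envelope (sketch): plain LoRA on an (in, out) linear layer adds r * (in + out)
# weights (A: r x in, B: out x r); DoRA stores one extra magnitude value per output feature.
def lora_param_count(in_features: int, out_features: int, r: int) -> int:
    return r * (in_features + out_features)

# e.g. Qwen2.5-7B's q_proj (3584 -> 3584) at r=64 adds 64 * 7168 = 458,752 params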
# Prepare the model for QLoRA training
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
model = get_peft_model(model, lora_config)
# Print a summary of trainable parameters
model.print_trainable_parameters()
# Prints e.g.: trainable params: ... || all params: ... || trainable%: ...
# (exact figures depend on the model, rank, and target modules)
# === Load the dataset ===
dataset = load_dataset("json", data_files=DATASET)
# Or use a public dataset:
# dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
# === Format the dataset into Qwen2.5 / ChatML format ===
def format_chat_qwen(example):
    """Format an example for Qwen2.5 using the ChatML template."""
    messages = example.get("messages", [])
    if not messages:
        # Handle alpaca-style data
        text = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        text += f"<|im_start|>user\n{example['instruction']}"
        if example.get("input"):
            text += f"\n{example['input']}"
        text += f"<|im_end|>\n<|im_start|>assistant\n{example['output']}<|im_end|>"
    else:
        # Handle messages format (ChatML)
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
    return {"text": text}
dataset = dataset.map(format_chat_qwen, remove_columns=dataset["train"].column_names)
# === Training arguments (PEFT 0.14+ / TRL 0.12+) ===
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch = 2 * 8 = 16
    learning_rate=2e-4,
    weight_decay=0.001,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    bf16=True,   # use bf16 on modern GPUs (A100, RTX 30/40)
    # fp16=True, # use fp16 instead on older GPUs
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    group_by_length=True,
    report_to="wandb",  # or "tensorboard"
    # SFTConfig-specific:
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_text_field="text",
    packing=True,  # pack multiple examples per sequence for efficiency
)
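# Scale check (sketch): with packing enabled, each optimizer step sees at most
# per_device_train_batch_size * gradient_accumulation_steps * MAX_SEQ_LENGTH tokens.
tokens_per_step = 2 * 8 * MAX_SEQ_LENGTH  # = 65,536 token upper bound; real packing density varies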
# === Start training ===
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    processing_class=tokenizer,  # TRL 0.12+ renamed the `tokenizer` kwarg to `processing_class`
    args=training_args,
)
trainer.train()
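# If the run is interrupted, it can be resumed from the latest checkpoint in OUTPUT_DIR:
# trainer.train(resume_from_checkpoint=True)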
# === Save the LoRA adapter ===
trainer.save_model(f"{OUTPUT_DIR}/final")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
print(f"Model saved to {OUTPUT_DIR}/final")