import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer
# === Configuration ===
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
DATASET = "your_dataset.json" # or HuggingFace dataset name
OUTPUT_DIR = "./output"
MAX_SEQ_LENGTH = 2048
# === Load Model with 4-bit Quantization ===
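# NF4 with double quantization stores the frozen base weights in roughly 4 bits per
# parameter, so a 7B model takes on the order of 4 GB of VRAM before activations and
# optimizer state (approximate figure; actual usage depends on batch size and sequence length).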
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Mistral ships no pad token; reuse EOS for padding
tokenizer.padding_side = "right"           # right padding is the usual choice for SFT here
# === Configure LoRA ===
lora_config = LoraConfig(
    r=64,            # Rank (higher = more capacity, more VRAM)
    lora_alpha=16,   # Scaling factor
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention
        "gate_proj", "up_proj", "down_proj",     # MLP
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
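# LoRA updates are scaled by lora_alpha / r = 16 / 64 = 0.25 before being added to the frozen weights.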
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Print trainable parameters
model.print_trainable_parameters()
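# With r=64 over the seven target modules, expect roughly 2% of parameters to be trainable
# on a 7B model (on the order of 170M); the exact count printed above depends on the base model.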
# === Load Dataset ===
dataset = load_dataset("json", data_files=DATASET)
# Or: dataset = load_dataset("timdettmers/openassistant-guanaco")
# === Format Dataset ===
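# format_chat below assumes Alpaca-style records, e.g.:
#   {"instruction": "...", "input": "...", "output": "..."}
# where "input" may be empty or missing; adapt the keys if your dataset differs.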
def format_chat(example):
    # Adjust based on your dataset format
    text = f"### Instruction:\n{example['instruction']}\n\n"
    if example.get('input'):
        text += f"### Input:\n{example['input']}\n\n"
    text += f"### Response:\n{example['output']}"
    return {"text": text}
dataset = dataset.map(format_chat)
# === Training Arguments ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size = 4 * 4 = 16 per device
    learning_rate=2e-4,
    weight_decay=0.001,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    bf16=True,  # matches bnb_4bit_compute_dtype above; use fp16=True (and float16 compute dtype) on GPUs without bfloat16 support
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    group_by_length=True,
    report_to="wandb",  # or "tensorboard"
)
# === Train ===
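# Note: the keyword arguments below follow older TRL releases; in recent TRL versions,
# dataset_text_field, max_seq_length, and packing are set on an SFTConfig passed as `args`.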
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
    packing=True,  # Pack multiple examples into one sequence
)
trainer.train()
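# To pick up from the most recent checkpoint in OUTPUT_DIR instead:
# trainer.train(resume_from_checkpoint=True)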
# === Save ===
trainer.save_model(f"{OUTPUT_DIR}/final")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
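# --- Optional: reload the saved adapter for inference (a sketch; paths and settings are assumptions) ---
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, quantization_config=bnb_config, device_map="auto"
# )
# inference_model = PeftModel.from_pretrained(base, f"{OUTPUT_DIR}/final")
# inference_model.eval()
# (To ship a single adapter-free checkpoint, load the base model in full precision instead
# and call merge_and_unload() on the PeftModel before saving.)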