# Fine-Tuning Models with Hugging Face

## What We're Building

A complete pipeline for fine-tuning Hugging Face models (LLMs, vision, etc.) on Clore GPUs with QLoRA, PEFT, and automatic checkpoint management — achieving state-of-the-art results at minimal cost.

## Prerequisites

* Clore.ai API key
* Python 3.10+
* Hugging Face account (for model access)

## Step 1: Fine-Tuning Configuration

```python
# finetune_config.py
"""Configuration for Hugging Face fine-tuning."""

from dataclasses import dataclass, field
from typing import Optional, List

@dataclass
class ModelConfig:
    """Model configuration."""
    model_name: str = "meta-llama/Llama-2-7b-hf"
    trust_remote_code: bool = True
    torch_dtype: str = "bfloat16"
    
    # Quantization
    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True

@dataclass
class LoRAConfig:
    """LoRA/QLoRA configuration."""
    r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.1
    target_modules: List[str] = field(default_factory=lambda: [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ])
    bias: str = "none"
    task_type: str = "CAUSAL_LM"

@dataclass
class TrainingConfig:
    """Training configuration."""
    # Data
    dataset_name: str = "databricks/databricks-dolly-15k"
    max_seq_length: int = 2048
    
    # Training
    num_epochs: int = 3
    per_device_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    learning_rate: float = 2e-4
    weight_decay: float = 0.01
    warmup_ratio: float = 0.03
    
    # Optimizer
    optim: str = "paged_adamw_8bit"
    
    # Logging
    logging_steps: int = 10
    save_steps: int = 100
    
    # Output
    output_dir: str = "/workspace/fine-tuned-model"
    hub_model_id: Optional[str] = None

@dataclass
class CloreConfig:
    """Clore provisioning configuration."""
    api_key: str = ""
    gpu_type: str = "RTX 4090"
    max_price_usd: float = 0.50
    image: str = "pytorch/pytorch:2.7.1-cuda12.8-cudnn9-devel"
```

## Step 2: Fine-Tuning Script

```python
# finetune.py
"""Hugging Face fine-tuning script with QLoRA."""

import os
import sys
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import wandb


def load_config(config_path: str) -> dict:
    """Load configuration from JSON file."""
    with open(config_path) as f:
        return json.load(f)


def setup_model(model_config: dict):
    """Setup model with quantization."""
    
    # BitsAndBytes config for 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=model_config.get("load_in_4bit", True),
        bnb_4bit_compute_dtype=getattr(torch, model_config.get("bnb_4bit_compute_dtype", "bfloat16")),
        bnb_4bit_quant_type=model_config.get("bnb_4bit_quant_type", "nf4"),
        bnb_4bit_use_double_quant=model_config.get("bnb_4bit_use_double_quant", True)
    )
    
    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_config["model_name"],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=model_config.get("trust_remote_code", True),
        torch_dtype=getattr(torch, model_config.get("torch_dtype", "bfloat16"))
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_config["model_name"],
        trust_remote_code=model_config.get("trust_remote_code", True)
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    
    # Prepare for k-bit training
    model = prepare_model_for_kbit_training(model)
    
    return model, tokenizer


def setup_lora(model, lora_config: dict):
    """Apply LoRA to the model."""
    
    peft_config = LoraConfig(
        r=lora_config.get("r", 64),
        lora_alpha=lora_config.get("lora_alpha", 16),
        lora_dropout=lora_config.get("lora_dropout", 0.1),
        target_modules=lora_config.get("target_modules", [
            "q_proj", "k_proj", "v_proj", "o_proj"
        ]),
        bias=lora_config.get("bias", "none"),
        task_type=TaskType.CAUSAL_LM
    )
    
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    
    return model


def prepare_dataset(dataset_config: dict, tokenizer):
    """Prepare dataset for training."""
    
    # Load dataset
    dataset = load_dataset(dataset_config["dataset_name"])
    
    if "train" in dataset:
        dataset = dataset["train"]
    
    # Format function
    def format_instruction(example):
        """Format as instruction-following."""
        if "instruction" in example and "response" in example:
            # Alpaca/Dolly format
            context = example.get("context", "")
            if context:
                text = f"""### Instruction:
{example['instruction']}

### Context:
{context}

### Response:
{example['response']}"""
            else:
                text = f"""### Instruction:
{example['instruction']}

### Response:
{example['response']}"""
        elif "text" in example:
            text = example["text"]
        else:
            raise ValueError("Unknown dataset format")
        
        return {"text": text}
    
    # Apply formatting
    dataset = dataset.map(format_instruction)
    
    # Tokenize
    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=dataset_config.get("max_seq_length", 2048),
            padding="max_length"
        )
    
    dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
    
    return dataset


def main():
    # Load config
    config_path = sys.argv[1] if len(sys.argv) > 1 else "config.json"
    config = load_config(config_path)
    
    model_config = config.get("model", {})
    lora_config = config.get("lora", {})
    training_config = config.get("training", {})
    
    # Check GPU
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    # Initialize wandb
    if os.environ.get("WANDB_API_KEY"):
        wandb.init(
            project=training_config.get("wandb_project", "clore-finetuning"),
            name=training_config.get("wandb_run_name"),
            config=config
        )
    
    # Setup model
    print(f"\n📦 Loading model: {model_config['model_name']}")
    model, tokenizer = setup_model(model_config)
    
    # Apply LoRA
    print("\n🔧 Applying LoRA...")
    model = setup_lora(model, lora_config)
    
    # Prepare dataset
    print(f"\n📊 Loading dataset: {training_config['dataset_name']}")
    dataset = prepare_dataset(training_config, tokenizer)
    print(f"   Samples: {len(dataset)}")
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=training_config.get("output_dir", "/workspace/fine-tuned-model"),
        num_train_epochs=training_config.get("num_epochs", 3),
        per_device_train_batch_size=training_config.get("per_device_batch_size", 4),
        gradient_accumulation_steps=training_config.get("gradient_accumulation_steps", 4),
        learning_rate=training_config.get("learning_rate", 2e-4),
        weight_decay=training_config.get("weight_decay", 0.01),
        warmup_ratio=training_config.get("warmup_ratio", 0.03),
        optim=training_config.get("optim", "paged_adamw_8bit"),
        logging_steps=training_config.get("logging_steps", 10),
        save_steps=training_config.get("save_steps", 100),
        save_total_limit=3,
        bf16=True,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        report_to="wandb" if os.environ.get("WANDB_API_KEY") else "none"
    )
    
    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator
    )
    
    # Train
    print("\n🚀 Starting fine-tuning...")
    trainer.train()
    
    # Save
    output_dir = training_config.get("output_dir", "/workspace/fine-tuned-model")
    print(f"\n💾 Saving model to {output_dir}")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    # Push to hub if configured
    hub_model_id = training_config.get("hub_model_id")
    if hub_model_id:
        print(f"\n📤 Pushing to Hub: {hub_model_id}")
        trainer.push_to_hub(hub_model_id)
    
    print("\n✅ Fine-tuning complete!")
    
    if os.environ.get("WANDB_API_KEY"):
        wandb.finish()


if __name__ == "__main__":
    main()
```

## Step 3: Remote Fine-Tuning Orchestrator

```python
# run_finetune.py
"""Orchestrate Hugging Face fine-tuning on Clore GPUs."""

import os
import sys
import time
import json
import requests
import paramiko
from scp import SCPClient
from typing import Dict

class HFFineTuner:
    """Run Hugging Face fine-tuning on Clore."""
    
    BASE_URL = "https://api.clore.ai"
    
    def __init__(self, api_key: str, hf_token: str = None, wandb_key: str = None):
        self.api_key = api_key
        self.hf_token = hf_token
        self.wandb_key = wandb_key
        self.headers = {"auth": api_key}
        self.ssh_client = None
        self.current_order = None
    
    def _request(self, method: str, endpoint: str, **kwargs) -> Dict:
        url = f"{self.BASE_URL}{endpoint}"
        response = requests.request(method, url, headers=self.headers, **kwargs)
        return response.json()
    
    def find_gpu(self, gpu_type: str, max_price: float, min_vram_gb: int = 24) -> Dict:
        """Find a suitable GPU for fine-tuning."""
        servers = self._request("GET", "/v1/marketplace")["servers"]
        
        candidates = []
        for server in servers:
            if server.get("rented"):
                continue
            
            gpus = server.get("gpu_array", [])
            if not any(gpu_type in g for g in gpus):
                continue
            
            price = server.get("price", {}).get("usd", {}).get("on_demand_clore", 999)
            if price > max_price:
                continue
            
            candidates.append({
                "id": server["id"],
                "gpus": gpus,
                "price": price,
                "reliability": server.get("reliability", 0)
            })
        
        if not candidates:
            raise Exception(f"No {gpu_type} found under ${max_price}/hr")
        
        # Sort by reliability
        candidates.sort(key=lambda x: -x["reliability"])
        return candidates[0]
    
    def provision(self, server_id: int, ssh_password: str) -> Dict:
        """Provision a server."""
        order = self._request("POST", "/v1/create_order", json={
            "renting_server": server_id,
            "type": "on-demand",
            "currency": "CLORE-Blockchain",
            "image": "pytorch/pytorch:2.7.1-cuda12.8-cudnn9-devel",
            "ports": {"22": "tcp"},
            "env": {"NVIDIA_VISIBLE_DEVICES": "all"},
            "ssh_password": ssh_password
        })
        
        order_id = order["order_id"]
        print(f"📦 Order created: {order_id}")
        
        # Wait for ready
        for _ in range(120):
            orders = self._request("GET", "/v1/my_orders")["orders"]
            current = next((o for o in orders if o["order_id"] == order_id), None)
            if current and current.get("status") == "running":
                self.current_order = current
                return current
            time.sleep(2)
        
        raise Exception("Timeout")
    
    def connect_ssh(self, host: str, port: int, password: str):
        """Connect via SSH."""
        self.ssh_client = paramiko.SSHClient()
        self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        
        for _ in range(5):
            try:
                self.ssh_client.connect(host, port=port, username="root",
                                        password=password, timeout=30)
                print(f"✅ Connected to {host}:{port}")
                return
            except Exception:
                time.sleep(10)
        raise Exception("SSH connection failed")
    
    def run_command(self, cmd: str, stream: bool = True) -> str:
        """Run command on server."""
        stdin, stdout, stderr = self.ssh_client.exec_command(cmd, get_pty=True)
        
        output = ""
        if stream:
            for line in iter(stdout.readline, ""):
                print(line, end="")
                output += line
        else:
            output = stdout.read().decode()
        return output
    
    def setup_environment(self):
        """Setup fine-tuning environment."""
        print("\n🔧 Setting up environment...")
        
        commands = [
            "pip install --upgrade pip",
            "pip install transformers datasets accelerate peft bitsandbytes trl wandb",
            "mkdir -p /workspace"
        ]
        
        if self.hf_token:
            commands.append(f"huggingface-cli login --token {self.hf_token}")
        
        if self.wandb_key:
            commands.append(f"wandb login {self.wandb_key}")
        
        for cmd in commands:
            self.run_command(cmd, stream=False)
        
        print("✅ Environment ready")
    
    def upload_files(self, files: Dict[str, str]):
        """Upload files to server."""
        with SCPClient(self.ssh_client.get_transport()) as scp:
            for local, remote in files.items():
                scp.put(local, remote)
                print(f"📤 {local} → {remote}")
    
    def download_model(self, local_dir: str):
        """Download fine-tuned model."""
        os.makedirs(local_dir, exist_ok=True)
        
        with SCPClient(self.ssh_client.get_transport()) as scp:
            scp.get("/workspace/fine-tuned-model", local_dir, recursive=True)
        
        print(f"📥 Model downloaded to {local_dir}")
    
    def run_finetuning(
        self,
        model_name: str,
        dataset_name: str,
        output_name: str = "fine-tuned-model",
        gpu_type: str = "RTX 4090",
        max_price: float = 0.50,
        num_epochs: int = 3,
        batch_size: int = 4,
        lora_r: int = 64,
        max_seq_length: int = 2048,
        ssh_password: str = "HFFine123!"
    ):
        """Run complete fine-tuning job."""
        
        start_time = time.time()
        
        try:
            # Find GPU
            print(f"🔍 Finding {gpu_type}...")
            gpu = self.find_gpu(gpu_type, max_price)
            print(f"   Found: Server {gpu['id']} @ ${gpu['price']:.2f}/hr")
            
            # Provision
            print(f"\n📦 Provisioning...")
            order = self.provision(gpu["id"], ssh_password)
            
            # Connect
            ssh_info = order["connection"]["ssh"]
            parts = ssh_info.split()
            host = parts[1].split("@")[1]
            port = int(parts[3]) if len(parts) > 3 else 22
            
            self.connect_ssh(host, port, ssh_password)
            
            # Setup
            self.setup_environment()
            
            # Create config
            config = {
                "model": {
                    "model_name": model_name,
                    "load_in_4bit": True,
                    "torch_dtype": "bfloat16"
                },
                "lora": {
                    "r": lora_r,
                    "lora_alpha": 16,
                    "lora_dropout": 0.1
                },
                "training": {
                    "dataset_name": dataset_name,
                    "max_seq_length": max_seq_length,
                    "num_epochs": num_epochs,
                    "per_device_batch_size": batch_size,
                    "output_dir": "/workspace/fine-tuned-model"
                }
            }
            
            # Upload config and script
            config_json = json.dumps(config, indent=2)
            self.run_command(f"cat > /workspace/config.json << 'EOF'\n{config_json}\nEOF")
            self.upload_files({"finetune.py": "/workspace/finetune.py"})
            
            # Run fine-tuning
            print("\n🚀 Starting fine-tuning...")
            self.run_command("cd /workspace && python finetune.py config.json")
            
            # Download results
            self.download_model(f"./{output_name}")
            
            # Summary
            duration = (time.time() - start_time) / 3600
            cost = duration * gpu["price"]
            
            print("\n" + "="*50)
            print("✅ Fine-tuning complete!")
            print(f"⏱️  Duration: {duration:.2f} hours")
            print(f"💰 Cost: ${cost:.2f}")
            
        finally:
            self.cleanup()
    
    def cleanup(self):
        """Cleanup resources."""
        if self.ssh_client:
            self.ssh_client.close()
        
        if self.current_order:
            order_id = self.current_order["order_id"]
            self._request("POST", "/v1/cancel_order", json={"id": order_id})
            print(f"✅ Order cancelled")


def main():
    api_key = os.environ.get("CLORE_API_KEY") or sys.argv[1]
    hf_token = os.environ.get("HF_TOKEN")
    wandb_key = os.environ.get("WANDB_API_KEY")
    
    finetuner = HFFineTuner(api_key, hf_token, wandb_key)
    
    finetuner.run_finetuning(
        model_name="meta-llama/Llama-2-7b-hf",
        dataset_name="databricks/databricks-dolly-15k",
        output_name="llama2-7b-dolly",
        gpu_type="RTX 4090",
        max_price=0.50,
        num_epochs=1,
        batch_size=4,
        lora_r=64
    )


if __name__ == "__main__":
    main()
```

## Model-Specific Configurations

### Llama 2 7B (QLoRA)

```python
config = {
    "model_name": "meta-llama/Llama-2-7b-hf",
    "load_in_4bit": True,
    "lora_r": 64,
    "batch_size": 4,  # ~18GB VRAM
    "max_seq_length": 2048
}
# Estimated: ~2 hours on RTX 4090, ~$0.80
```

### Mistral 7B

```python
config = {
    "model_name": "mistralai/Mistral-7B-v0.1",
    "load_in_4bit": True,
    "lora_r": 32,
    "batch_size": 4,
    "max_seq_length": 4096  # Mistral supports longer context
}
# Estimated: ~2 hours on RTX 4090, ~$0.80
```

### Llama 2 13B (QLoRA)

```python
config = {
    "model_name": "meta-llama/Llama-2-13b-hf",
    "load_in_4bit": True,
    "lora_r": 32,
    "batch_size": 2,  # ~22GB VRAM
    "max_seq_length": 2048
}
# Estimated: ~4 hours on RTX 4090, ~$1.60
```

## VRAM Requirements

| Model       | 4-bit QLoRA     | Full Fine-tune |
| ----------- | --------------- | -------------- |
| Llama 2 7B  | \~18GB          | \~56GB         |
| Llama 2 13B | \~22GB          | \~104GB        |
| Llama 2 70B | \~48GB (2x24GB) | \~560GB        |
| Mistral 7B  | \~16GB          | \~56GB         |
| Falcon 7B   | \~16GB          | \~56GB         |

> 📚 See also: [How to Fine-Tune LLaMA 3 on a Cloud GPU](https://blog.clore.ai/how-to-fine-tune-llama-3-cloud-gpu/)

## Cost Comparison

| Model       | Dataset    | Clore (RTX 4090) | AWS (A100) | Savings |
| ----------- | ---------- | ---------------- | ---------- | ------- |
| Llama 2 7B  | Dolly 15K  | \~$0.80          | \~$12.00   | 93%     |
| Mistral 7B  | Custom 50K | \~$2.40          | \~$36.00   | 93%     |
| Llama 2 13B | Alpaca     | \~$1.60          | \~$24.00   | 93%     |

## Next Steps

* [Hyperparameter Sweeps with Optuna](https://docs.clore.ai/dev/machine-learning-and-training/hyperparameter-sweeps)
* [Auto-Scaling ML Training Pipeline](https://docs.clore.ai/dev/machine-learning-and-training/auto-scaling-pipeline)
* [Deploying a REST API for Model Inference](https://docs.clore.ai/dev/inference-and-deployment/rest-api-deployment)
