#!/usr/bin/env python3
"""
Reinforcement Learning Training on Clore.ai GPUs.
Usage:
python train_rl.py --api-key YOUR_API_KEY --env CartPole-v1 --algo ppo --timesteps 100000
"""
import argparse
import os
import time
import json
import secrets
import requests
import paramiko
from scp import SCPClient
from dataclasses import dataclass
from typing import Optional, Dict
@dataclass
class TrainingResult:
    """Summary of one remote RL training run.

    Returned by CloreRLTrainer.train(). Note that model_path points at a
    file on the REMOTE server; use CloreRLTrainer.download_model() to
    copy it locally.
    """
    algorithm: str       # algorithm name as passed to train() (e.g. "ppo")
    env_id: str          # Gymnasium environment id (e.g. "CartPole-v1")
    timesteps: int       # total training timesteps requested
    time_seconds: float  # wall-clock duration of the remote run
    final_reward: float  # mean evaluation reward (10 episodes); 0 on failure
    model_path: str      # remote path of the saved model archive
    success: bool        # True only if the remote script reported a result
    cost_usd: float      # estimated rental cost = hours * hourly spot price
class CloreRLTrainer:
    """End-to-end RL training on a rented Clore.ai GPU server.

    Lifecycle:
        setup()          -> rent the cheapest matching GPU, connect SSH,
                            install the RL stack remotely
        train()          -> run a stable-baselines3 job on the server
        download_model() -> fetch the trained model archive
        cleanup()        -> close connections and release the server

    Use as a context manager so the (paid, per-hour) server is always
    released, even when training raises.
    """

    BASE_URL = "https://api.clore.ai"
    # Remote Docker image; must ship CUDA + PyTorch for SB3's device="cuda".
    IMAGE = "pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {"auth": api_key}  # Clore's auth header scheme
        self.order_id = None        # marketplace order id once provisioned
        self.ssh_host = None
        self.ssh_port = None
        self.ssh_password = None
        self.hourly_cost = 0.0      # USD/hr of the rented server
        self._ssh = None            # paramiko.SSHClient once connected
        self._scp = None            # SCPClient over the SSH transport

    def _api(self, method: str, endpoint: str, **kwargs) -> Dict:
        """Call the Clore REST API, retrying on rate limits.

        Retries up to 3 times with exponential backoff when the API
        replies with code 5 (rate limited).

        Raises:
            Exception: on any non-zero API code, or when retries run out.
        """
        url = f"{self.BASE_URL}{endpoint}"
        for attempt in range(3):
            response = requests.request(method, url, headers=self.headers, **kwargs)
            data = response.json()
            if data.get("code") == 5:  # rate limited -- back off and retry
                time.sleep(2 ** attempt)
                continue
            if data.get("code") != 0:
                raise Exception(f"API Error: {data}")
            return data
        raise Exception(f"Max retries exceeded for {method} {endpoint}")

    def setup(self, max_price: float = 0.40):
        """Rent the cheapest acceptable GPU, wait for boot, open SSH, and
        install the RL packages on the remote machine.

        Args:
            max_price: highest acceptable spot price in USD/hr.

        Raises:
            Exception: if no matching unrented GPU is at or under
                max_price, or the server is not "running" within ~4 min.
        """
        print("🔍 Finding GPU...")
        servers = self._api("GET", "/v1/marketplace")["servers"]
        gpus = ["RTX 4090", "RTX 4080", "RTX 3090", "RTX 3080", "A100"]
        candidates = []
        for s in servers:
            if s.get("rented"):
                continue
            gpu_array = s.get("gpu_array", [])
            # Keep servers whose GPU list mentions any acceptable model.
            if not any(any(g in gpu for g in gpus) for gpu in gpu_array):
                continue
            price = s.get("price", {}).get("usd", {}).get("spot")
            if price and price <= max_price:
                candidates.append({"id": s["id"], "gpus": gpu_array, "price": price})
        if not candidates:
            raise Exception(f"No GPU under ${max_price}/hr")
        gpu = min(candidates, key=lambda x: x["price"])
        print(f" {gpu['gpus']} @ ${gpu['price']:.2f}/hr")
        self.ssh_password = secrets.token_urlsafe(16)
        self.hourly_cost = gpu["price"]
        print("🚀 Provisioning server...")
        order_data = {
            "renting_server": gpu["id"],
            "type": "spot",
            "currency": "CLORE-Blockchain",
            "image": self.IMAGE,
            "ports": {"22": "tcp", "6006": "http"},  # SSH + TensorBoard
            "env": {"NVIDIA_VISIBLE_DEVICES": "all"},
            "ssh_password": self.ssh_password,
            # Bid slightly above current spot to reduce the chance of
            # being outbid mid-training.
            "spotprice": gpu["price"] * 1.15
        }
        result = self._api("POST", "/v1/create_order", json=order_data)
        self.order_id = result["order_id"]
        print("⏳ Waiting for server...")
        for _ in range(120):  # poll every 2 s => ~4 minute budget
            orders = self._api("GET", "/v1/my_orders")["orders"]
            order = next((o for o in orders if o["order_id"] == self.order_id), None)
            if order and order.get("status") == "running":
                # Connection string looks like "ssh root@HOST -p PORT"
                # -- TODO confirm against the Clore API response format.
                conn = order["connection"]["ssh"]
                parts = conn.split()
                self.ssh_host = parts[1].split("@")[1]
                self.ssh_port = int(parts[-1]) if "-p" in conn else 22
                break
            time.sleep(2)
        else:
            raise Exception("Timeout")
        # Connect SSH
        self._ssh = paramiko.SSHClient()
        self._ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self._ssh.connect(self.ssh_host, port=self.ssh_port,
                          username="root", password=self.ssh_password, timeout=30)
        self._scp = SCPClient(self._ssh.get_transport())
        print(f"✅ Server ready: {self.ssh_host}:{self.ssh_port}")
        # Install packages
        print("📦 Installing RL packages...")
        self._exec("pip install -q stable-baselines3[extra] gymnasium tensorboard", timeout=300)

    def _exec(self, cmd: str, timeout: int = 7200) -> str:
        """Run a command over SSH and return its stdout (stderr is
        discarded unless the command redirects it with 2>&1)."""
        stdin, stdout, stderr = self._ssh.exec_command(cmd, timeout=timeout)
        # Block until the remote command finishes before reading output.
        stdout.channel.recv_exit_status()
        return stdout.read().decode()

    def train(self, env_id: str, algorithm: str, timesteps: int,
              learning_rate: float = 3e-4, n_envs: int = 4) -> "TrainingResult":
        """Train an SB3 agent on the remote GPU and return a summary.

        Args:
            env_id: Gymnasium environment id (e.g. "CartPole-v1").
            algorithm: SB3 class name, case-insensitive (e.g. "ppo").
            timesteps: total training timesteps.
            learning_rate: optimizer learning rate.
            n_envs: number of parallel training environments.

        Returns:
            TrainingResult; success is False when the remote script never
            printed its RESULT line (crash, OOM, bad env id, ...).
        """
        algo_class = algorithm.upper()
        # Remote training script; the RESULT: line is the machine-readable
        # channel back to this process.
        script = f'''
import gymnasium as gym
import time
import json
from stable_baselines3 import {algo_class}
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
env = make_vec_env("{env_id}", n_envs={n_envs})
eval_env = make_vec_env("{env_id}", n_envs=1)
model = {algo_class}("MlpPolicy", env, learning_rate={learning_rate}, verbose=1, device="cuda")
start = time.time()
model.learn(total_timesteps={timesteps}, progress_bar=True)
train_time = time.time() - start
model.save("/tmp/model")
mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10)
result = {{"success": True, "time": train_time, "reward": float(mean_reward)}}
print("RESULT:" + json.dumps(result))
'''
        self._exec(f"cat > /tmp/train.py << 'EOF'\n{script}\nEOF")
        print(f"🎮 Training {algorithm.upper()} on {env_id}...")
        print(f" Timesteps: {timesteps:,}")
        start = time.time()
        output = self._exec("python3 /tmp/train.py 2>&1", timeout=7200)
        elapsed = time.time() - start
        # Parse result
        result_data = {"success": False, "reward": 0, "time": elapsed}
        for line in output.split("\n"):
            if line.startswith("RESULT:"):
                result_data = json.loads(line[7:])
                break
        # Billing approximation: local wall-clock time at the spot price.
        cost = (elapsed / 3600) * self.hourly_cost
        return TrainingResult(
            algorithm=algorithm,
            env_id=env_id,
            timesteps=timesteps,
            time_seconds=elapsed,
            final_reward=result_data.get("reward", 0),
            model_path="/tmp/model.zip",
            success=result_data.get("success", False),
            cost_usd=cost
        )

    def download_model(self, local_path: str):
        """Copy the trained model archive from the server to local_path."""
        self._scp.get("/tmp/model.zip", local_path)

    def cleanup(self):
        """Close connections and release the rented server.

        Best-effort by design: every step is attempted and failures are
        printed rather than raised, so that (a) a failed SCP close cannot
        prevent the order from being cancelled (which would keep billing),
        and (b) cleanup in __exit__ never masks the exception that caused
        the with-body to unwind.
        """
        if self._scp:
            try:
                self._scp.close()
            except Exception as e:
                print(f"⚠️ SCP close failed: {e}")
            self._scp = None
        if self._ssh:
            try:
                self._ssh.close()
            except Exception as e:
                print(f"⚠️ SSH close failed: {e}")
            self._ssh = None
        if self.order_id:
            print("🧹 Releasing server...")
            try:
                self._api("POST", "/v1/cancel_order", json={"id": self.order_id})
            except Exception as e:
                # Keep order_id so a later manual cleanup() can retry.
                print(f"⚠️ Failed to cancel order {self.order_id}: {e}")
            else:
                self.order_id = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.cleanup()
def main():
    """CLI entry point: rent a GPU, train remotely, report, fetch model."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--api-key", required=True)
    parser.add_argument("--env", default="CartPole-v1", help="Gym environment")
    parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "dqn", "a2c", "td3"])
    parser.add_argument("--timesteps", type=int, default=100000)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--output", default="./model.zip")
    parser.add_argument("--max-price", type=float, default=0.40)
    opts = parser.parse_args()

    # The context manager guarantees the paid server is released on exit.
    with CloreRLTrainer(opts.api_key) as trainer:
        trainer.setup(opts.max_price)
        outcome = trainer.train(
            env_id=opts.env,
            algorithm=opts.algo,
            timesteps=opts.timesteps,
            learning_rate=opts.lr,
        )
        seconds = outcome.time_seconds
        print("\n" + "=" * 60)
        print("📊 TRAINING COMPLETE")
        print(f" Algorithm: {outcome.algorithm.upper()}")
        print(f" Environment: {outcome.env_id}")
        print(f" Timesteps: {outcome.timesteps:,}")
        print(f" Time: {seconds:.1f}s ({seconds/60:.1f} min)")
        print(f" Final Reward: {outcome.final_reward:.2f}")
        print(f" Cost: ${outcome.cost_usd:.4f}")
        if outcome.success:
            # Only pull the model when the remote run actually finished.
            trainer.download_model(opts.output)
            print(f" Model: {opts.output}")


if __name__ == "__main__":
    main()