```python
import requests
import time
from typing import Dict, Any, List, Optional
from dataclasses import dataclass


@dataclass
class RentalInfo:
    """Information about an active rental."""
    order_id: int
    server_id: int
    status: str
    ssh_host: str
    ssh_port: int
    http_endpoint: str
    cost_per_hour: float
    started_at: int
```
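For orientation, here is what a populated `RentalInfo` might look like; every value below is an illustrative placeholder, not real rental data:

```python
# Hypothetical rental record; all values are placeholders
rental = RentalInfo(
    order_id=12345,
    server_id=678,
    status="running",
    ssh_host="203.0.113.10",
    ssh_port=2222,
    http_endpoint="http://203.0.113.10:8000",
    cost_per_hour=0.35,
    started_at=int(time.time()),
)
print(f"{rental.http_endpoint} at ${rental.cost_per_hour}/hr")
```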
## Using Pre-built Images

Instead of building your own Docker image, you can deploy a pre-built inference server image such as vLLM's `vllm/vllm-openai` or Hugging Face's Text Generation Inference, both of which expose OpenAI-compatible APIs.
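The calling convention is the same either way. As a sketch, assuming a vLLM-style server running on your rental (the host, API key, and model name below are placeholders):

```python
import requests

# Placeholders: substitute your rental's HTTP endpoint, key, and model name
resp = requests.post(
    "http://HOST:8000/v1/completions",
    headers={"Authorization": "Bearer YOUR_KEY"},
    json={"model": "MODEL_NAME", "prompt": "Hello,", "max_tokens": 32},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```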
## Cost Comparison

| Model Size | GPU | Clore.ai | AWS p3.2xlarge | Savings |
|------------|-----|----------|----------------|---------|
| 7B params | RTX 3090 | ~$0.20/hr | $3.06/hr | 93% |
| 13B params | RTX 4090 | ~$0.35/hr | $3.06/hr | 89% |
| 70B params | A100 40GB | ~$1.20/hr | $4.10/hr | 71% |
| 70B params (Q4) | RTX 4090 | ~$0.35/hr | N/A | n/a |
Running a 70B model around the clock saves roughly $2,100/month compared to AWS.
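That estimate follows directly from the table's hourly rates over a 30-day month:

```python
# Savings estimate for a 70B model, using the rates from the table above
aws_hr, clore_hr = 4.10, 1.20
hours_per_month = 24 * 30  # 720 hours
print(f"${(aws_hr - clore_hr) * hours_per_month:,.0f}/month saved")  # $2,088/month
```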
## Step 1: Set Up the Clore Client
> 📦 **Using the standard Clore API client.** See [Clore API Client Reference](../reference/clore-client.md) for the full implementation and setup instructions. Save it as `clore_client.py` in your project.
```python
from clore_client import CloreClient
client = CloreClient(api_key="your-api-key")
```

## Step 2: FastAPI Inference Server

The server below loads the model once at startup and exposes an OpenAI-compatible API with key-based authentication, health checks, and Prometheus metrics.

```python
# inference_server.py
"""
Production-ready inference server with:
- Model loading and caching
- API key authentication
- Request validation
- Health checks
- Prometheus metrics
- Error handling
"""
import os
import time
import torch
import logging
from typing import Optional, List
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Depends, Security, Request
from fastapi.security import APIKeyHeader
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from prometheus_client import Counter, Histogram, Gauge, generate_latest
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Prometheus metrics
REQUEST_COUNT = Counter(
"inference_requests_total",
"Total inference requests",
["endpoint", "status"]
)
REQUEST_LATENCY = Histogram(
"inference_latency_seconds",
"Request latency in seconds",
["endpoint"]
)
GPU_MEMORY_USED = Gauge(
"gpu_memory_used_bytes",
"GPU memory used in bytes"
)
MODEL_LOADED = Gauge(
"model_loaded",
"Whether model is loaded (1) or not (0)"
)
# Configuration
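# API_KEYS is a comma-separated list, e.g. API_KEYS="key-a,key-b".
# The demo default below is for local testing only; always override it
# in production.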
API_KEYS = set(os.environ.get("API_KEYS", "demo-key-12345").split(","))
MODEL_ID = os.environ.get("MODEL_ID", "microsoft/DialoGPT-medium")
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "256"))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Global model state
model = None
tokenizer = None
generator = None
class CompletionRequest(BaseModel):
"""Text completion request."""
prompt: str = Field(..., min_length=1, max_length=4096)
max_tokens: int = Field(default=128, ge=1, le=2048)
temperature: float = Field(default=0.7, ge=0.0, le=2.0)
top_p: float = Field(default=0.9, ge=0.0, le=1.0)
stop: Optional[List[str]] = None
class CompletionResponse(BaseModel):
"""Text completion response."""
id: str
object: str = "text_completion"
created: int
model: str
choices: List[dict]
usage: dict
class HealthResponse(BaseModel):
"""Health check response."""
status: str
model: str
device: str
gpu_available: bool
gpu_memory_used_mb: Optional[float]
uptime_seconds: float
# Security
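# auto_error=False means a missing X-API-Key header is passed to
# verify_api_key as None instead of triggering an automatic 403, so the
# 401 below covers both missing and invalid keys.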
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
async def verify_api_key(api_key: str = Security(api_key_header)):
if api_key not in API_KEYS:
raise HTTPException(status_code=401, detail="Invalid API key")
return api_key
def load_model():
"""Load the model into GPU memory."""
global model, tokenizer, generator
logger.info(f"Loading model: {MODEL_ID}")
logger.info(f"Device: {DEVICE}")
try:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Set padding token if not set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
device_map="auto" if DEVICE == "cuda" else None,
low_cpu_mem_usage=True
)
        # Do not pass an explicit device when the model was loaded with
        # device_map="auto"; the pipeline reuses the model's placement,
        # and specifying a device for an accelerate-loaded model raises
        # a ValueError.
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer
        )
MODEL_LOADED.set(1)
logger.info("Model loaded successfully")
# Log GPU memory
if torch.cuda.is_available():
mem = torch.cuda.memory_allocated() / 1024**2
logger.info(f"GPU memory used: {mem:.2f} MB")
except Exception as e:
MODEL_LOADED.set(0)
logger.error(f"Failed to load model: {e}")
raise
# Track startup time
startup_time = time.time()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan handler."""
load_model()
yield
# Cleanup
if torch.cuda.is_available():
torch.cuda.empty_cache()
app = FastAPI(
title="Model Inference API",
description="Production ML inference endpoint on Clore.ai GPU",
version="1.0.0",
lifespan=lifespan
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
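# Note: allow_origins=["*"] accepts requests from any origin; restrict this
# to your frontend's origin(s) before exposing the server publicly.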
@app.post("/v1/completions", response_model=CompletionResponse)
async def create_completion(
request: CompletionRequest,
api_key: str = Depends(verify_api_key)
):
"""Generate text completion."""
start_time = time.time()
try:
# Generate
outputs = generator(
request.prompt,
max_new_tokens=request.max_tokens,
temperature=request.temperature,
top_p=request.top_p,
do_sample=request.temperature > 0,
pad_token_id=tokenizer.eos_token_id,
return_full_text=False
)
generated_text = outputs[0]["generated_text"]
        # Trim at the first stop sequence and record whether one actually fired
        finish_reason = "length"
        if request.stop:
            for stop_seq in request.stop:
                if stop_seq in generated_text:
                    generated_text = generated_text.split(stop_seq)[0]
                    finish_reason = "stop"
# Calculate tokens
prompt_tokens = len(tokenizer.encode(request.prompt))
completion_tokens = len(tokenizer.encode(generated_text))
latency = time.time() - start_time
# Update metrics
REQUEST_COUNT.labels(endpoint="/v1/completions", status="success").inc()
REQUEST_LATENCY.labels(endpoint="/v1/completions").observe(latency)
if torch.cuda.is_available():
GPU_MEMORY_USED.set(torch.cuda.memory_allocated())
return CompletionResponse(
id=f"cmpl-{int(time.time()*1000)}",
created=int(time.time()),
model=MODEL_ID,
choices=[{
"text": generated_text,
"index": 0,
"finish_reason": "stop" if request.stop else "length"
}],
usage={
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
)
except Exception as e:
REQUEST_COUNT.labels(endpoint="/v1/completions", status="error").inc()
logger.error(f"Inference error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/v1/chat/completions")
async def create_chat_completion(
request: dict,
api_key: str = Depends(verify_api_key)
):
"""Chat completion endpoint (OpenAI compatible)."""
messages = request.get("messages", [])
# Convert messages to prompt
prompt = ""
for msg in messages:
role = msg.get("role", "user")
content = msg.get("content", "")
if role == "system":
prompt += f"System: {content}\n"
elif role == "user":
prompt += f"User: {content}\n"
elif role == "assistant":
prompt += f"Assistant: {content}\n"
prompt += "Assistant:"
# Generate
completion_request = CompletionRequest(
prompt=prompt,
max_tokens=request.get("max_tokens", 128),
temperature=request.get("temperature", 0.7),
top_p=request.get("top_p", 0.9)
)
result = await create_completion(completion_request, api_key)
# Convert to chat format
return {
"id": result.id,
"object": "chat.completion",
"created": result.created,
"model": result.model,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": result.choices[0]["text"]
},
"finish_reason": result.choices[0]["finish_reason"]
}],
"usage": result.usage
}
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
gpu_memory = None
if torch.cuda.is_available():
gpu_memory = torch.cuda.memory_allocated() / 1024**2
return HealthResponse(
status="healthy" if MODEL_LOADED._value._value == 1 else "unhealthy",
model=MODEL_ID,
device=DEVICE,
gpu_available=torch.cuda.is_available(),
gpu_memory_used_mb=gpu_memory,
uptime_seconds=time.time() - startup_time
)
@app.get("/v1/models")
async def list_models():
"""List available models."""
return {
"object": "list",
"data": [{
"id": MODEL_ID,
"object": "model",
"created": int(startup_time),
"owned_by": "clore-deployment"
}]
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint."""
if torch.cuda.is_available():
GPU_MEMORY_USED.set(torch.cuda.memory_allocated())
return Response(
content=generate_latest(),
media_type="text/plain"
)
@app.get("/")
async def root():
"""Root endpoint."""
return {
"name": "Model Inference API",
"version": "1.0.0",
"model": MODEL_ID,
"docs": "/docs"
}
if __name__ == "__main__":
import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
```
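Once the server is running, a quick end-to-end check from your workstation might look like this (the host is a placeholder; the route, header, and key match the code above):

```python
import requests

BASE = "http://HOST:8000"                   # placeholder: your rental's endpoint
HEADERS = {"X-API-Key": "demo-key-12345"}   # matches the API_KEYS default above

# Health check, then a small completion request
print(requests.get(f"{BASE}/health", timeout=10).json())

resp = requests.post(
    f"{BASE}/v1/completions",
    headers=HEADERS,
    json={"prompt": "Clore.ai makes GPU rental", "max_tokens": 32},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```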