Integrate Clore.ai GPU resources into your GitHub Actions CI/CD pipeline. Run GPU tests, CUDA validation, ML model testing, and performance benchmarks automatically on every commit or PR.
#!/usr/bin/env python3
"""
GPU Test Runner for GitHub Actions
Provisions a Clore.ai GPU, runs tests, and cleans up.
"""
import argparse
import json
import os
import sys
import time
import secrets
import requests
import paramiko
from scp import SCPClient
from typing import Dict, Optional
class GPUTestRunner:
    """Provisions a Clore.ai GPU server, runs a test script on it over SSH,
    and cancels the order afterwards.

    Lifecycle: provision() -> run_tests() -> cleanup(). cleanup() is safe to
    call even when no order was ever created, so it can sit in a finally block.
    """

    BASE_URL = "https://api.clore.ai"
    DEFAULT_IMAGE = "nvidia/cuda:12.8.0-base-ubuntu22.04"

    def __init__(self, api_key: str):
        self.api_key = api_key
        # Clore.ai authenticates via a custom "auth" header.
        self.headers = {"auth": api_key}
        self.order_id = None      # set by provision() once an order exists
        self.ssh_host = None      # set once the order reaches "running"
        self.ssh_port = None
        self.ssh_password = None  # generated fresh per run in provision()

    def _api(self, method: str, endpoint: str, **kwargs) -> Dict:
        """Make an API request and return the parsed JSON body.

        Raises:
            Exception: if the API responds with a non-zero "code" field.
        """
        url = f"{self.BASE_URL}{endpoint}"
        # BUG FIX: kwargs (e.g. json=order_data) were previously discarded,
        # so POST bodies never reached the API. Forward them to requests.
        response = requests.request(
            method, url, headers=self.headers, timeout=30, **kwargs
        )
        data = response.json()
        if data.get("code") != 0:
            raise Exception(f"API Error: {data}")
        return data

    def find_gpu(self, gpu_type: str, max_price: float) -> Optional[Dict]:
        """Return the first un-rented marketplace server whose GPU names
        contain gpu_type (case-insensitive substring) at or under max_price
        USD/hr spot, or None if nothing qualifies."""
        data = self._api("GET", "/v1/marketplace")
        for server in data.get("servers", []):
            if server.get("rented"):
                continue
            gpus = server.get("gpu_array", [])
            if not any(gpu_type.lower() in g.lower() for g in gpus):
                continue
            price = server.get("price", {}).get("usd", {}).get("spot")
            if price and price <= max_price:
                return {
                    "id": server["id"],
                    "gpus": gpus,
                    "price": price
                }
        return None

    def provision(self, gpu_type: str, max_price: float, image: str = None) -> bool:
        """Rent a matching GPU server and wait until it is running.

        Returns True when the server came up and SSH details were recorded;
        False when no server matched or the order timed out (in which case
        the order is cancelled before returning)."""
        print(f"🔍 Looking for {gpu_type} under ${max_price}/hr...")
        server = self.find_gpu(gpu_type, max_price)
        if not server:
            print(f"❌ No {gpu_type} available under ${max_price}/hr")
            return False
        print(f"✅ Found: {server['gpus']} @ ${server['price']}/hr")
        # Fresh cryptographically-random password per run.
        self.ssh_password = secrets.token_urlsafe(16)
        order_data = {
            "renting_server": server["id"],
            "type": "on-demand",  # Use on-demand for CI reliability
            "currency": "CLORE-Blockchain",
            "image": image or self.DEFAULT_IMAGE,
            "ports": {"22": "tcp"},
            "env": {"NVIDIA_VISIBLE_DEVICES": "all"},
            "ssh_password": self.ssh_password
        }
        print("🚀 Creating order...")
        result = self._api("POST", "/v1/create_order", json=order_data)
        self.order_id = result["order_id"]
        print(f"⏳ Waiting for server (order: {self.order_id})...")
        # Poll for up to ~3 minutes (90 iterations x 2s) for the container.
        for _ in range(90):
            orders = self._api("GET", "/v1/my_orders")["orders"]
            order = next((o for o in orders if o["order_id"] == self.order_id), None)
            if order and order.get("status") == "running":
                # Connection string looks like "ssh root@HOST -p PORT".
                conn = order["connection"]["ssh"]
                parts = conn.split()
                self.ssh_host = parts[1].split("@")[1]
                self.ssh_port = int(parts[-1]) if "-p" in conn else 22
                print(f"✅ Server ready: {self.ssh_host}:{self.ssh_port}")
                return True
            time.sleep(2)
        print("❌ Timeout waiting for server")
        self.cleanup()
        return False

    def run_tests(self, test_script: str, timeout: int = 600) -> Dict:
        """Upload test_script to the server, execute it, and collect results.

        Writes output.txt, gpu-info.txt and summary.txt into ./test-results
        on the LOCAL machine. Returns {"success", "exit_code", "output"}.
        """
        print(f"🧪 Running tests: {test_script}")
        # Connect SSH
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(
            self.ssh_host,
            port=self.ssh_port,
            username="root",
            password=self.ssh_password,
            timeout=30
        )
        scp = SCPClient(ssh.get_transport())
        try:
            # Upload test script
            scp.put(test_script, "/tmp/test.sh")
            # Make executable and run
            stdin, stdout, stderr = ssh.exec_command(
                "chmod +x /tmp/test.sh && /tmp/test.sh 2>&1",
                timeout=timeout
            )
            # BUG FIX: drain stdout BEFORE asking for the exit status.
            # recv_exit_status() blocks until the command finishes; if the
            # remote output fills the channel window first, neither side can
            # make progress and the call deadlocks. Reading first avoids it.
            output = stdout.read().decode()
            exit_code = stdout.channel.recv_exit_status()
            # Save results
            os.makedirs("test-results", exist_ok=True)
            with open("test-results/output.txt", "w") as f:
                f.write(output)
            # Get GPU info
            stdin, stdout, stderr = ssh.exec_command("nvidia-smi")
            gpu_info = stdout.read().decode()
            with open("test-results/gpu-info.txt", "w") as f:
                f.write(gpu_info)
            # Summary
            status = "PASSED" if exit_code == 0 else "FAILED"
            summary = f"""
Test Status: {status}
Exit Code: {exit_code}
GPU: {self.ssh_host}
Output (last 50 lines):
{chr(10).join(output.split(chr(10))[-50:])}
"""
            with open("test-results/summary.txt", "w") as f:
                f.write(summary)
            return {
                "success": exit_code == 0,
                "exit_code": exit_code,
                "output": output
            }
        finally:
            scp.close()
            ssh.close()

    def cleanup(self):
        """Cancel the order, if one exists. Best-effort: API failures are
        printed but never raised, so this is safe in a finally block."""
        if self.order_id:
            print(f"🧹 Cleaning up order {self.order_id}...")
            try:
                self._api("POST", "/v1/cancel_order", json={"id": self.order_id})
                print("✅ Order cancelled")
            except Exception as e:
                print(f"⚠️ Cleanup error: {e}")
def main():
    """CLI entry point: provision a GPU, run the test script, always clean up."""
    parser = argparse.ArgumentParser(description="GPU Test Runner for CI/CD")
    parser.add_argument("--api-key", required=True)
    parser.add_argument("--gpu", default="RTX 3080")
    parser.add_argument("--max-price", type=float, default=0.30)
    parser.add_argument("--image", default=None)
    parser.add_argument("--test-script", required=True)
    parser.add_argument("--timeout", type=int, default=600)
    args = parser.parse_args()

    runner = GPUTestRunner(args.api_key)
    try:
        # Bail out early (non-zero exit) when no suitable GPU could be rented.
        if not runner.provision(args.gpu, args.max_price, args.image):
            sys.exit(1)
        outcome = runner.run_tests(args.test_script, args.timeout)
        if not outcome["success"]:
            print(f"❌ Tests failed with exit code {outcome['exit_code']}")
            sys.exit(1)
        print("✅ All tests passed!")
    finally:
        # The rented order is cancelled no matter how the run ended.
        runner.cleanup()


if __name__ == "__main__":
    main()
#!/bin/bash
# tests/cuda_test.sh
# Verifies the NVIDIA driver, the CUDA toolkit, and PyTorch GPU support.
set -e

echo "=== CUDA Compatibility Test ==="
echo ""

# Step 1: the driver must be present and able to enumerate GPUs.
echo "1. Checking NVIDIA driver..."
nvidia-smi
echo ""

# Step 2: nvcc may live outside PATH inside the container image, so a
# missing binary is reported but is not fatal.
echo "2. Checking CUDA version..."
nvcc --version || echo "nvcc not in path (may be in container)"
echo ""

# Step 3: install PyTorch and exercise a real CUDA kernel, not just the
# driver query.
echo "3. Testing PyTorch CUDA..."
pip install -q torch
python3 << 'PYEOF'
import torch

print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'CUDA version: {torch.version.cuda}')
print(f'Device count: {torch.cuda.device_count()}')
if torch.cuda.is_available():
    print(f'Device name: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
    # A small matmul proves kernels actually launch on the device.
    lhs = torch.randn(1000, 1000).cuda()
    rhs = torch.randn(1000, 1000).cuda()
    _ = torch.matmul(lhs, rhs)
    print(f'Matrix multiplication test: PASSED')
PYEOF
echo ""
echo "=== All CUDA tests passed! ==="
#!/bin/bash
# tests/ml_model_test.sh
# Loads BERT, runs a short inference benchmark, and enforces a minimum
# throughput so a slow or misconfigured GPU fails the build.
set -e

echo "=== ML Model Test ==="
echo ""

# Dependencies for model download and inference.
pip install -q torch torchvision transformers

python3 << 'PYEOF'
import torch
import time

print("1. Loading model...")
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

on_gpu = torch.cuda.is_available()
if on_gpu:
    model = model.cuda()
    print(f" Model loaded on GPU: {torch.cuda.get_device_name(0)}")
else:
    print(" WARNING: Running on CPU")

print("\n2. Running inference benchmark...")
text = "Hello, this is a test of the GPU inference speed."
inputs = tokenizer(text, return_tensors="pt")
if on_gpu:
    inputs = {key: tensor.cuda() for key, tensor in inputs.items()}

# One warmup pass so lazy CUDA initialization is excluded from timing.
with torch.no_grad():
    _ = model(**inputs)

iterations = 100
start = time.time()
with torch.no_grad():
    for _ in range(iterations):
        _ = model(**inputs)
if on_gpu:
    torch.cuda.synchronize()
elapsed = time.time() - start
throughput = iterations / elapsed

print(f" Iterations: {iterations}")
print(f" Total time: {elapsed:.2f}s")
print(f" Throughput: {throughput:.1f} inferences/sec")

# Hard floor: any supported GPU should comfortably exceed this.
min_throughput = 50  # Expected minimum on any GPU
if throughput < min_throughput:
    print(f"\n❌ FAILED: Throughput below minimum ({min_throughput}/sec)")
    exit(1)
print(f"\n✅ PASSED: Throughput meets requirements")
PYEOF
echo ""
echo "=== ML Model tests passed! ==="
#!/bin/bash
# tests/performance_test.sh
# Benchmarks matmul TFLOPS, memory bandwidth, and conv throughput, then
# compares each metric against a baseline for regression detection.
set -e
echo "=== Performance Regression Test ==="
echo ""
pip install -q torch numpy
python3 << 'EOF'
import torch
import time
import json
import os

results = {}

# Test 1: Matrix multiplication
print("1. Matrix multiplication benchmark...")
size = 4096
iterations = 50
a = torch.randn(size, size, device='cuda')
b = torch.randn(size, size, device='cuda')
# Warmup so one-time CUDA/cuBLAS init is excluded from the timing.
torch.matmul(a, b)
torch.cuda.synchronize()
start = time.time()
for _ in range(iterations):
    c = torch.matmul(a, b)
# Kernels launch asynchronously; synchronize before reading the clock.
torch.cuda.synchronize()
elapsed = time.time() - start
# A matmul of two n x n matrices costs ~2*n^3 FLOPs.
tflops = (2 * size ** 3 * iterations) / elapsed / 1e12
print(f" Size: {size}x{size}")
print(f" TFLOPS: {tflops:.2f}")
results["matmul_tflops"] = tflops

# Test 2: Memory bandwidth
print("\n2. Memory bandwidth test...")
size_gb = 1
data = torch.randn(size_gb * 1024 * 1024 * 1024 // 4, device='cuda')
torch.cuda.synchronize()
start = time.time()
for _ in range(10):
    data.clone()
torch.cuda.synchronize()
elapsed = time.time() - start
bandwidth_gbps = (size_gb * 10 * 2) / elapsed  # Read + write
print(f" Bandwidth: {bandwidth_gbps:.1f} GB/s")
results["bandwidth_gbps"] = bandwidth_gbps

# Test 3: Convolution
print("\n3. Convolution benchmark...")
import torch.nn as nn
batch, channels, height, width = 32, 64, 224, 224
conv = nn.Conv2d(channels, 128, 3, padding=1).cuda()
input_data = torch.randn(batch, channels, height, width, device='cuda')
# Warmup (also lets cuDNN pick its algorithm before timing starts).
conv(input_data)
torch.cuda.synchronize()
start = time.time()
for _ in range(50):
    conv(input_data)
torch.cuda.synchronize()
elapsed = time.time() - start
conv_throughput = 50 / elapsed
print(f" Throughput: {conv_throughput:.1f} batches/sec")
results["conv_batches_per_sec"] = conv_throughput

# Save results
# BUG FIX: this script executes on the remote GPU server (uploaded to /tmp
# by the runner), where the test-results directory created on the CI host
# does not exist. Without makedirs the open() below raises
# FileNotFoundError and the whole test run fails after the benchmarks ran.
os.makedirs("test-results", exist_ok=True)
with open("test-results/performance.json", "w") as f:
    json.dump(results, f, indent=2)

print("\n=== Performance Results ===")
print(json.dumps(results, indent=2))

# Check against baselines (adjust for your GPU)
BASELINES = {
    "matmul_tflops": 10.0,  # Minimum expected TFLOPS
    "bandwidth_gbps": 200.0,  # Minimum expected bandwidth
    "conv_batches_per_sec": 20.0
}
failed = False
for metric, baseline in BASELINES.items():
    actual = results.get(metric, 0)
    status = "✅" if actual >= baseline * 0.8 else "❌"  # 80% tolerance
    print(f"{status} {metric}: {actual:.2f} (baseline: {baseline})")
    if actual < baseline * 0.5:  # Hard fail at 50%
        failed = True
if failed:
    exit(1)
print("\n✅ All performance tests passed!")
EOF
echo ""
echo "=== Performance tests complete ==="