# -- Setup: load the pretrained Stable Audio Open model once at startup. --
# NOTE(review): removed a stray `print(f"已生成:{name}")` line that referenced
# an undefined `name` before any code had run (guaranteed NameError); it
# appears to be pasted from a different script.
import tempfile

import gradio as gr  # was missing, but `gr.Interface` is used below
import torch
import torchaudio
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# Download (or load from the local cache) the model weights and its config.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
model = model.to("cuda")  # assumes a CUDA device is available — TODO confirm
sample_rate = model_config["sample_rate"]
sample_size = model_config["sample_size"]
def generate_audio(prompt: str, duration: float, steps: int, cfg_scale: float, seed: int) -> str:
    """Generate an audio clip from a text prompt and return a WAV file path.

    Args:
        prompt: Text description of the desired audio.
        duration: Target clip length in seconds (fed as ``seconds_total``).
        steps: Number of diffusion sampling steps.
        cfg_scale: Classifier-free guidance scale.
        seed: RNG seed; any value <= 0 requests a random seed.

    Returns:
        Path to a temporary ``.wav`` file containing the generated audio.
    """
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": duration,
    }]
    output = generate_diffusion_cond(
        model,
        conditioning=conditioning,
        steps=steps,
        cfg_scale=cfg_scale,
        sample_size=sample_size,
        sample_rate=sample_rate,
        device="cuda",
        # stable_audio_tools uses -1 (not None) as the "random seed" sentinel;
        # the original passed seed=None, which crashes in torch.manual_seed.
        seed=seed if seed > 0 else -1,
    )
    # Output is (batch, channels, samples); torchaudio.save expects
    # (channels, frames), so drop the batch dim WITHOUT the original `.T`
    # transpose, which produced a (samples, channels) tensor.
    audio = output[0].to(torch.float32).cpu()
    # Peak-normalize into [-1, 1] so the saved WAV is not clipped.
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = audio / peak
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        torchaudio.save(f.name, audio, sample_rate)
    return f.name
# Gradio UI wiring: one input per generate_audio parameter, in order.
demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="描述你想要的音频..."),
        gr.Slider(1, 47, value=15, step=1, label="时长(秒)"),
        gr.Slider(20, 200, value=100, step=10, label="Steps"),
        gr.Slider(1, 15, value=7, step=0.5, label="CFG Scale"),
        # generate_audio takes a 5th argument (seed). The original list was
        # missing it and instead contained a bare `placeholder=...` keyword
        # argument — a SyntaxError inside a list literal, pasted from a
        # different (image relighting) app.
        gr.Number(label="Seed (<=0 随机)", value=-1, precision=0),
    ],
    outputs=gr.Audio(label="生成的音频", type="filepath"),
    # NOTE(review): removed the duplicate `outputs=gr.Image(label="重光图像")`
    # keyword — a repeated keyword argument is a SyntaxError, and the Image
    # output belongs to the other pasted app.
    title="Stable Audio Open - 文本到音频",
    description="从文本描述生成音乐和音效。在 CLORE.AI 上运行。",
    # The example rows were a bare list with no keyword; they must be passed
    # as `examples=` and match the five inputs above.
    examples=[
        ["欢快的电子舞曲,带合成器,128 BPM", 20, 100, 7, 42],
        ["大雨的雷暴", 15, 100, 7, 123],
        ["宁静的钢琴旋律,情感化", 30, 100, 7, 456],
    ],
)

# Bind to all interfaces so the demo is reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)