LTX-2(音频 + 视频)
在 Clore.ai 的 GPU 上使用 LTX-2 生成带原生音频的视频——包括音效、环境声和口型同步。
最后更新于
这有帮助吗?
# Install dependencies
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
pip install diffusers transformers accelerate sentencepiece
# Quote the extras spec so shells that glob on [...] (e.g. zsh) pass it through verbatim
pip install "imageio[ffmpeg]" soundfile scipy
# Verify the GPU: the device-properties attribute is `total_memory` (bytes), shown here in GiB
python -c "import torch; print(torch.cuda.get_device_name(0), torch.cuda.get_device_properties(0).total_memory // 1024**3, 'GB')"

import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video
import soundfile as sf

# Load LTX-2 (make sure to use the correct model ID once it is released)
pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video-2",
    torch_dtype=torch.bfloat16,
)
# Model CPU offload handles device placement itself; moving the pipeline to
# CUDA first would defeat most of the memory savings, so no pipe.to("cuda").
pipe.enable_model_cpu_offload()

# Describe both the visuals and the sounds that should accompany them.
prompt = (
    "一位铁匠在铁砧上锻打发光的金属,火花四溅,"
    "锤子敲打钢铁的有节奏撞击声,车间的环境噪音"
)

# NOTE(review): diffusers' LTX pipelines validate that width/height are
# divisible by 32 and 720 is not — confirm against the LTX-2 pipeline.
output = pipe(
    prompt=prompt,
    negative_prompt="寂静、模糊、低质量",
    num_frames=121,
    width=1280,
    height=720,
    num_inference_steps=40,
    guidance_scale=7.0,
    # Fixed seed so the run is reproducible
    generator=torch.Generator("cuda").manual_seed(42),
)

# Export the video frames
export_to_video(output.frames[0], "blacksmith.mp4", fps=24)

# Export the audio track if the pipeline output carries one
if hasattr(output, "audio") and output.audio is not None:
    # NOTE(review): 16000 Hz is hard-coded — confirm it matches the sample
    # rate of the audio the pipeline actually returns.
    sf.write("blacksmith_audio.wav", output.audio, samplerate=16000)
    print("音频已单独保存 — 使用 ffmpeg 混流:")
    print(" ffmpeg -i blacksmith.mp4 -i blacksmith_audio.wav -c:v copy -c:a aac output.mp4")
print("完成:blacksmith.mp4")

import torch
from PIL import Image
from diffusers import LTXImageToVideoPipeline
from diffusers.utils import export_to_video

pipe = LTXImageToVideoPipeline.from_pretrained(
    "Lightricks/LTX-Video-2",
    torch_dtype=torch.bfloat16,
)
# Model CPU offload handles device placement itself; moving the pipeline to
# CUDA first would defeat most of the memory savings, so no pipe.to("cuda").
pipe.enable_model_cpu_offload()

# Portrait image used for lip sync; PIL's resize takes (width, height)
# NOTE(review): no width/height is passed to the pipeline below, so it will
# presumably fall back to its own default resolution — confirm whether
# width=720, height=1280 should be passed as well.
image = Image.open("portrait.png").resize((720, 1280))

output = pipe(
    prompt="一个人清晰发音地说‘欢迎来到AI视频的未来’,背景中性",
    image=image,
    num_frames=121,
    num_inference_steps=40,
    guidance_scale=7.0,
)

export_to_video(output.frames[0], "talking_head.mp4", fps=24)

import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video-2", torch_dtype=torch.bfloat16
).to("cuda")

# Audio-rich prompt — describe the sounds explicitly
prompt = (
    "热带村庄的锡屋顶上落雨,"
    "远处雷声隆隆,雷声间短暂的鸟鸣,"
    "泥土小路上的水洼泛起涟漪"
)

# NOTE(review): diffusers' LTX pipelines validate that width/height are
# divisible by 32 and 720 is not — confirm against the LTX-2 pipeline.
output = pipe(
    prompt=prompt,
    num_frames=121,
    width=1280,
    height=720,
    num_inference_steps=40,
    guidance_scale=6.5,
)

export_to_video(output.frames[0], "rain_scene.mp4", fps=24)