Kokoro TTS
在 Clore.ai 的 GPU 上运行 Kokoro TTS——一个超轻量级的 8200 万参数文本转语音模型。
最后更新于
这有帮助吗?
这有帮助吗?
# Install system dependencies
apt-get install -y espeak-ng
# Install Kokoro and audio I/O.
# The requirement is quoted on purpose: unquoted, the shell treats ">=0.9.4"
# as a redirection to a file named "=0.9.4" and pip never sees the version pin.
pip install "kokoro>=0.9.4" soundfile torch
# Japanese support (optional); quoted so the shell cannot glob-expand "[ja]"
pip install "misaki[ja]"
# Chinese support (optional); quoted for the same reason
pip install "misaki[zh]"
# Verify
python -c "from kokoro import KPipeline; print('Kokoro ready')"
from kokoro import KPipeline
import soundfile as sf
# Sample rate in Hz handed to soundfile for every chunk written below.
SAMPLE_RATE = 24000

# Initialise the pipeline.
# 'a' = American English, 'b' = British English
pipeline = KPipeline(lang_code='a')

# The sample text is English so that it matches lang_code='a' and the
# American English voice selected below.
text = """
Kokoro is a lightweight text-to-speech model with only eighty-two million
parameters. Despite its small size, it produces natural and expressive speech.
"""

# Generate audio. Voice options: af_heart, af_bella, af_nicole, af_sarah, af_sky,
# am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis
generator = pipeline(text, voice='af_heart', speed=1.0)

# Each yielded item unpacks to (graphemes, phonemes, audio); every chunk is
# written to its own numbered WAV file.
for i, (graphemes, phonemes, audio) in enumerate(generator):
    sf.write(f'output_{i}.wav', audio, SAMPLE_RATE)
    print(f"Chunk {i}: {graphemes[:50]}...")

print("完成!")


from kokoro import KPipeline
import soundfile as sf
# Sample rate in Hz handed to soundfile for every file written below.
SAMPLE_RATE = 24000

pipeline = KPipeline(lang_code='a')
text = "Welcome to Clore.ai, the peer-to-peer GPU marketplace."
voices = ['af_heart', 'af_bella', 'am_adam', 'am_michael']

# Render the same sentence once per voice; each chunk is written to
# '<voice>_<chunk index>.wav'.
for voice in voices:
    generator = pipeline(text, voice=voice, speed=1.0)
    for i, (gs, ps, audio) in enumerate(generator):
        sf.write(f'{voice}_{i}.wav', audio, SAMPLE_RATE)
    # Reported once per voice, after all of its chunks have been written.
    print(f"Generated: {voice}")


from kokoro import KPipeline
import numpy as np
import soundfile as sf

# Sample rate in Hz used for the output file and the duration calculation.
SAMPLE_RATE = 24000

# 'b' = British English
pipeline = KPipeline(lang_code='b')
text = "Good afternoon. This is a demonstration of British English synthesis."

# speed < 1.0 = slower, speed > 1.0 = faster
generator = pipeline(text, voice='bf_emma', speed=0.85)

# Keep only the audio element of each yielded (graphemes, phonemes, audio) item.
all_audio = [audio for _, _, audio in generator]

# Join all chunks into one array so the result is a single WAV file.
combined = np.concatenate(all_audio)
sf.write('british_slow.wav', combined, SAMPLE_RATE)
print(f"Total duration: {len(combined)/SAMPLE_RATE:.1f}s")


from kokoro import KPipeline
import soundfile as sf
import numpy as np
# Sample rate in Hz used for the silence buffer, the output file and the
# duration calculation.
SAMPLE_RATE = 24000

pipeline = KPipeline(lang_code='a')

# Chapter texts are English so that they match lang_code='a' and the
# af_bella voice used below.
chapters = [
    "Chapter one. Our journey begins here.",
    "The sun rose over the mountains, casting long shadows across the valley.",
    "She opened the door and stepped into the unknown.",
]

all_audio = []
# 0.5 s of silence, appended after every chapter (including the last one).
silence = np.zeros(int(SAMPLE_RATE * 0.5))

for idx, text in enumerate(chapters):
    # Collect every audio chunk produced for this chapter, then the pause.
    for gs, ps, audio in pipeline(text, voice='af_bella', speed=1.0):
        all_audio.append(audio)
    all_audio.append(silence)
    print(f"Chapter {idx+1} done")

# Join chapters and pauses into one array and write a single WAV file.
combined = np.concatenate(all_audio)
sf.write('audiobook.wav', combined, SAMPLE_RATE)
print(f"Total: {len(combined)/SAMPLE_RATE:.1f}s")