Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 68af4fe502 | |||
| ee3649c1b3 | |||
| 82d77d445c | |||
| 8f95d13073 | |||
| df38f0a167 | |||
| 9adfaf6996 | |||
| 46cfce0c97 | |||
| da700f264e | |||
| 9da570d409 | |||
| 9374524c47 | |||
| ec6d30e996 |
@@ -1,5 +1,9 @@
|
||||
<h2 align="center">VoxCPM2: Tokenizer-Free TTS for Multilingual Speech Generation, Creative Voice Design, and True-to-Life Cloning</h2>
|
||||
|
||||
<p align="center">
|
||||
<b>English</b> | <a href="./README_zh.md">中文</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/OpenBMB/VoxCPM/"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue" alt="Project Page"></a>
|
||||
<a href="https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo"><img src="https://img.shields.io/badge/Live%20Playground-Demo-orange" alt="Live Playground"></a>
|
||||
@@ -45,13 +49,13 @@ VoxCPM is a **tokenizer-free** Text-to-Speech system that directly generates con
|
||||
- ⚡ **Real-Time Streaming** — RTF as low as ~0.3 on NVIDIA RTX 4090, and ~0.13 when accelerated by [Nano-vLLM](https://github.com/a710128/nanovllm-voxcpm)
|
||||
- 📜 **Fully Open-Source & Commercial-Ready** — Weights and code released under the [Apache-2.0](LICENSE) license, free for commercial use
|
||||
|
||||
<details>
|
||||
|
||||
<summary><b>🌍 Supported Languages (30)</b></summary>
|
||||
<br>
|
||||
Arabic, Burmese, Chinese, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Indonesian, Italian, Japanese, Khmer, Korean, Lao, Malay, Norwegian, Polish, Portuguese, Russian, Spanish, Swahili, Swedish, Tagalog, Thai, Turkish, Vietnamese
|
||||
|
||||
Chinese Dialects: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山东话, 天津话, 闽南话
|
||||
</details>
|
||||
|
||||
|
||||
### News
|
||||
|
||||
@@ -99,7 +103,7 @@ from voxcpm import VoxCPM
|
||||
import soundfile as sf
|
||||
|
||||
model = VoxCPM.from_pretrained(
|
||||
"openbmb/VoxCPM2"
|
||||
"openbmb/VoxCPM2",
|
||||
load_denoiser=False,
|
||||
)
|
||||
|
||||
@@ -112,6 +116,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||
print("saved: demo.wav")
|
||||
```
|
||||
|
||||
If you prefer downloading from ModelScope first, you can use:
|
||||
|
||||
```bash
|
||||
pip install modelscope
|
||||
```
|
||||
|
||||
```python
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
from voxcpm import VoxCPM
|
||||
import soundfile as sf
|
||||
|
||||
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
|
||||
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
|
||||
|
||||
wav = model.generate(
|
||||
text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
|
||||
#### 🎨 Voice Design
|
||||
|
||||
Create a voice from a natural-language description — no reference audio needed. **Format:** put the description in parentheses at the start of `text` (e.g. `"(your voice description)The text to synthesize."`):
|
||||
@@ -132,13 +158,13 @@ Upload a reference audio. The model clones the timbre, and you can still use con
|
||||
```python
|
||||
wav = model.generate(
|
||||
text="This is a cloned voice generated by VoxCPM2.",
|
||||
reference_wav_path="speaker.wav",
|
||||
reference_wav_path="path/to/voice.wav",
|
||||
)
|
||||
sf.write("clone.wav", wav, model.tts_model.sample_rate)
|
||||
|
||||
wav = model.generate(
|
||||
text="(slightly faster, cheerful tone)This is a cloned voice with style control.",
|
||||
reference_wav_path="speaker.wav",
|
||||
reference_wav_path="path/to/voice.wav",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
@@ -152,9 +178,9 @@ Provide both the reference audio and its exact transcript for audio-continuation
|
||||
```python
|
||||
wav = model.generate(
|
||||
text="This is an ultimate cloning demonstration using VoxCPM2.",
|
||||
prompt_wav_path="speaker_reference.wav",
|
||||
prompt_wav_path="path/to/voice.wav",
|
||||
prompt_text="The transcript of the reference audio.",
|
||||
reference_wav_path="speaker_reference.wav",
|
||||
reference_wav_path="path/to/voice.wav", # optional, for better similarity
|
||||
)
|
||||
sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
@@ -200,6 +226,7 @@ voxcpm clone \
|
||||
--text "This is a voice cloning demo." \
|
||||
--prompt-audio path/to/voice.wav \
|
||||
--prompt-text "reference transcript" \
|
||||
--reference-audio path/to/voice.wav \
|
||||
--output out.wav  # --reference-audio is optional, for better similarity
|
||||
|
||||
# Batch processing
|
||||
@@ -211,8 +238,8 @@ voxcpm --help
|
||||
|
||||
### Web Demo
|
||||
|
||||
```bash
|
||||
python app.py # then open http://localhost:7860
|
||||
```bash
|
||||
python app.py --model-dir /path/to/VoxCPM2 --port 8808 # use a local model directory, open http://localhost:8808
|
||||
```
|
||||
|
||||
### 🚢 Production Deployment (Nano-vLLM)
|
||||
@@ -388,10 +415,54 @@ VoxCPM2 achieves state-of-the-art or comparable results on public zero-shot and
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### Internal 30-Language ASR Benchmark
|
||||
|
||||
We additionally run an internal multilingual intelligibility benchmark with **30 languages × 500 samples**. ASR transcription is evaluated via **Gemini 3.1 Flash Lite API**.
|
||||
|
||||
<details>
|
||||
<summary><b>Internal 30-Language ASR Benchmark (click to expand)</b></summary>
|
||||
|
||||
| Language | Metric | VoxCPM2 | Fish S2-Pro |
|
||||
|---|---:|---:|---:|
|
||||
| ar (Arabic) | CER | 1.23% | 0.30% |
|
||||
| da (Danish) | WER | 2.70% | 3.52% |
|
||||
| de (German) | WER | 0.96% | 0.64% |
|
||||
| el (Greek) | WER | 3.17% | 4.61% |
|
||||
| en (English) | WER | 0.42% | 1.03% |
|
||||
| es (Spanish) | WER | 1.33% | 0.64% |
|
||||
| fi (Finnish) | WER | 2.24% | 2.80% |
|
||||
| fr (French) | WER | 2.16% | 2.34% |
|
||||
| he (Hebrew) | CER | 2.98% | 15.27% |
|
||||
| hi (Hindi) | CER | 0.79% | 0.91% |
|
||||
| id (Indonesian) | WER | 1.36% | 1.68% |
|
||||
| it (Italian) | WER | 1.65% | 1.08% |
|
||||
| ja (Japanese) | CER | 2.40% | 1.82% |
|
||||
| km (Khmer) | CER | 2.05% | 75.15% |
|
||||
| ko (Korean) | CER | 0.95% | 0.29% |
|
||||
| lo (Lao) | CER | 1.90% | 87.40% |
|
||||
| ms (Malay) | WER | 1.75% | 1.41% |
|
||||
| my (Burmese) | CER | 1.42% | 85.27% |
|
||||
| nl (Dutch) | WER | 1.25% | 1.68% |
|
||||
| no (Norwegian) | WER | 2.49% | 3.76% |
|
||||
| pl (Polish) | WER | 1.90% | 1.65% |
|
||||
| pt (Portuguese) | WER | 1.48% | 1.49% |
|
||||
| ru (Russian) | WER | 0.90% | 0.86% |
|
||||
| sv (Swedish) | WER | 2.22% | 2.63% |
|
||||
| sw (Swahili) | CER | 1.07% | 2.02% |
|
||||
| th (Thai) | CER | 0.94% | 1.92% |
|
||||
| tl (Tagalog) | WER | 2.63% | 4.00% |
|
||||
| tr (Turkish) | WER | 1.65% | 1.65% |
|
||||
| vi (Vietnamese) | WER | 1.56% | 5.56% |
|
||||
| zh (Chinese) | CER | 0.92% | 1.02% |
|
||||
| Average (30 languages) | | **1.68%** | - |
|
||||
|
||||
</details>
|
||||
|
||||
### InstructTTSEval
|
||||
|
||||
<details>
|
||||
<summary><b>Instruction-Guided Voice Design Results</b></summary>
|
||||
<summary><b>Instruction-Guided Voice Design Results (click to expand)</b></summary>
|
||||
|
||||
| Model | InstructTTSEval-ZH | | | InstructTTSEval-EN | | |
|
||||
|-------|:---:|:----:|:----:|:----:|:----:|:----:|
|
||||
|
||||
+590
@@ -0,0 +1,590 @@
|
||||
<h2 align="center">VoxCPM2:基于连续表征的多语言语音合成、创意音色设计与高保真声音克隆</h2>
|
||||
|
||||
<p align="center">
|
||||
<a href="./README.md">English</a> | <b>中文</b>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/OpenBMB/VoxCPM/"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue" alt="Project Page"></a>
|
||||
<a href="https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo"><img src="https://img.shields.io/badge/Live%20Playground-Demo-orange" alt="Live Playground"></a>
|
||||
<a href="https://voxcpm.readthedocs.io/zh-cn/latest/"><img src="https://img.shields.io/badge/Docs-ReadTheDocs-8CA1AF" alt="Documentation"></a>
|
||||
<a href="https://huggingface.co/openbmb/VoxCPM2"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-VoxCPM2-yellow" alt="Hugging Face"></a>
|
||||
<a href="https://modelscope.cn/models/OpenBMB/VoxCPM2"><img src="https://img.shields.io/badge/ModelScope-VoxCPM2-purple" alt="ModelScope"></a>
|
||||
<a href="https://openbmb.github.io/voxcpm2-demopage/"><img src="https://img.shields.io/badge/DemoPage-Audio%20Samples-red" alt="Demo Page"></a>
|
||||
|
||||
</p>
|
||||
|
||||
<div align="center">
|
||||
<img src="assets/voxcpm_logo.png" alt="VoxCPM Logo" width="35%">
|
||||
<br><br>
|
||||
<a href="https://trendshift.io/repositories/17704" target="_blank"><img src="https://trendshift.io/api/badge/repositories/17704" alt="OpenBMB%2FVoxCPM | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<p align="center">
|
||||
👋 欢迎加入社区,参与讨论与交流!
|
||||
<br>
|
||||
<a href="./assets/feishu-group.png" style="display:inline-block;vertical-align:middle; margin-left: 10px;">
|
||||
<img src="./assets/feishu-logo.png" width="16" height="16" style="vertical-align:middle;"> 飞书群
|
||||
</a>
|
||||
|
|
||||
<a href="https://discord.gg/KZUx7tVNwz" style="display:inline-block;vertical-align:middle;">
|
||||
<img src="./assets/discord-logo.png" width="16" height="16" style="vertical-align:middle;"> Discord
|
||||
</a>
|
||||
</p>
|
||||
|
||||
VoxCPM 是一个**无离散音频分词器**(Tokenizer-Free)的语音合成系统,通过端到端的**扩散自回归架构**直接生成连续语音表征,绕过对音频的离散编码步骤,实现高度自然且富有表现力的语音合成。
|
||||
|
||||
**VoxCPM2** 是最新的版本 — 基于 [MiniCPM-4](https://github.com/OpenBMB/MiniCPM) 基座构建,总计 **20亿** 参数,在超过 **200万小时** 的多语种音频数据上训练,支持 **30种全球语言+9种中文方言**、**音色设计**、**可控声音克隆**,原生输出 **48kHz** 高质量音频。
|
||||
|
||||
### ✨ 核心特性
|
||||
|
||||
- 🌍 **30种语言语音合成** — 直接输入原始文本即可合成(支持语言详见下文),无需额外语言标签
|
||||
- 🎨 **音色设计** — 用自然语言描述(性别、年龄、音色、情绪、语速……)凭空创建全新音色,无需参考音频
|
||||
- 🎛️ **可控声音克隆** — 从参考音频片段克隆任意声音,可叠加风格指令控制情绪、语速和表现力,同时保持原始音色
|
||||
- 🎙️ **极致克隆** — 提供参考音频及其文本内容,模型接着参考音频进行无缝续写,从而精准还原声音细节特征(与 VoxCPM1.5 一致)
|
||||
- 🔊 **48kHz 高质量音频** — 输入 16kHz 参考音频,通过 AudioVAE V2 的非对称编解码设计直接输出 48kHz 高质量音频,内置超分能力
|
||||
- 🧠 **语境感知合成** — 根据文本内容自动推断合适的韵律和表现力
|
||||
- ⚡ **实时流式合成** — 在 NVIDIA RTX 4090 上 RTF 低至 ~0.3,通过 [Nano-vLLM](https://github.com/a710128/nanovllm-voxcpm) 加速后可达 ~0.13
|
||||
- 📜 **完全开源,商用就绪** — 权重和代码基于 [Apache-2.0](LICENSE) 协议发布,免费商用
|
||||
|
||||
<details>
<summary><b>🌍 支持的语言(30种)</b></summary>
|
||||
<br>
|
||||
阿拉伯语、缅甸语、中文、丹麦语、荷兰语、英语、芬兰语、法语、德语、希腊语、希伯来语、印地语、印尼语、意大利语、日语、高棉语、韩语、老挝语、马来语、挪威语、波兰语、葡萄牙语、俄语、西班牙语、斯瓦希里语、瑞典语、菲律宾语、泰语、土耳其语、越南语
|
||||
|
||||
中国方言:四川话、粤语、吴语、东北话、河南话、陕西话、山东话、天津话、闽南话
</details>
|
||||
|
||||
|
||||
### 最新动态
|
||||
|
||||
* **[2026.04]** 🔥 发布 **VoxCPM2** — 20亿参数,30种语言,音色设计与可控声音克隆,48kHz 音频输出 | [使用文档](https://voxcpm.readthedocs.io/zh-cn/latest/) | [在线体验](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) | [官网体验](https://voxcpm.modelbest.cn/) (适用国内访问)
|
||||
* **[2025.12]** 🎉 开源 **VoxCPM1.5** [模型权重](https://huggingface.co/openbmb/VoxCPM1.5),支持 SFT 和 LoRA 微调。(**🏆 GitHub Trending #1**)
|
||||
* **[2025.09]** 🔥 发布 VoxCPM [技术报告](https://arxiv.org/abs/2509.24650)。
|
||||
* **[2025.09]** 🎉 开源 **VoxCPM-0.5B** [模型权重](https://huggingface.co/openbmb/VoxCPM-0.5B) (**🏆 HuggingFace Trending #1**)
|
||||
|
||||
---
|
||||
|
||||
## 目录
|
||||
|
||||
- [快速开始](#-快速开始)
|
||||
- [安装](#安装)
|
||||
- [Python API](#python-api)
|
||||
- [命令行使用](#命令行使用)
|
||||
- [Web Demo](#web-demo)
|
||||
- [生产部署](#-生产部署nano-vllm)
|
||||
- [模型与版本](#-模型与版本)
|
||||
- [性能评测](#-性能评测)
|
||||
- [微调](#%EF%B8%8F-微调)
|
||||
- [文档](#-文档)
|
||||
- [生态与社区](#-生态与社区)
|
||||
- [风险与局限性](#%EF%B8%8F-风险与局限性)
|
||||
- [引用](#-引用)
|
||||
|
||||
---
|
||||
|
||||
## 🚀 快速开始
|
||||
|
||||
### 安装
|
||||
|
||||
```sh
|
||||
pip install voxcpm
|
||||
```
|
||||
|
||||
> **环境要求:** Python ≥ 3.10,PyTorch ≥ 2.5.0,CUDA ≥ 12.0。详见 [快速开始文档](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html)。
|
||||
|
||||
### Python API
|
||||
|
||||
#### 🗣️ 文本转语音
|
||||
|
||||
```python
|
||||
from voxcpm import VoxCPM
|
||||
import soundfile as sf
|
||||
|
||||
model = VoxCPM.from_pretrained(
|
||||
"openbmb/VoxCPM2",
|
||||
load_denoiser=False,
|
||||
)
|
||||
|
||||
wav = model.generate(
|
||||
text="VoxCPM2 是目前推荐使用的多语言语音合成版本。",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||
print("已保存: demo.wav")
|
||||
```
|
||||
|
||||
如果你希望先从 ModelScope 下载模型到本地(适用于国内网络访问),可以使用:
|
||||
|
||||
```bash
|
||||
pip install modelscope
|
||||
```
|
||||
|
||||
```python
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
from voxcpm import VoxCPM
|
||||
import soundfile as sf
|
||||
|
||||
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
|
||||
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
|
||||
|
||||
wav = model.generate(
|
||||
text="VoxCPM2 是目前推荐使用的多语言语音合成版本。",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
|
||||
#### 🎨 音色设计
|
||||
|
||||
用自然语言描述创建全新音色,无需参考音频。**格式:** 在 `text` 开头用括号写入音色描述(如 `"(音色描述)要合成的文本。"`):
|
||||
|
||||
```python
|
||||
wav = model.generate(
|
||||
text="(年轻女性,声音温柔甜美)你好,欢迎使用VoxCPM2!",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
sf.write("voice_design.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
|
||||
#### 🎛️ 可控声音克隆
|
||||
|
||||
上传一段参考音频,模型克隆其音色,同时可以使用控制指令调节语速、情绪或风格。
|
||||
|
||||
```python
|
||||
wav = model.generate(
|
||||
text="这是VoxCPM2生成的克隆语音。",
|
||||
reference_wav_path="path/to/voice.wav",
|
||||
)
|
||||
sf.write("clone.wav", wav, model.tts_model.sample_rate)
|
||||
|
||||
wav = model.generate(
|
||||
text="(稍快一点,欢快的语气)这是带风格控制的克隆语音。",
|
||||
reference_wav_path="path/to/voice.wav",
|
||||
cfg_value=2.0,
|
||||
inference_timesteps=10,
|
||||
)
|
||||
sf.write("controllable_clone.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
|
||||
#### 🎙️ 极致克隆
|
||||
|
||||
提供参考音频及其精确文本转录,实现基于音频续写的高保真克隆。为获得最高克隆相似度,可将同一音频同时传给 `reference_wav_path` 和 `prompt_wav_path`:
|
||||
|
||||
```python
|
||||
wav = model.generate(
|
||||
text="这是使用VoxCPM2的极致克隆演示。",
|
||||
prompt_wav_path="path/to/voice.wav",
|
||||
prompt_text="参考音频的文本转录。",
|
||||
reference_wav_path="path/to/voice.wav", # 可选,提升相似度
|
||||
)
|
||||
sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><b>🔄 流式 API</b></summary>
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
chunks = []
|
||||
for chunk in model.generate_streaming(
|
||||
text="使用VoxCPM进行流式语音合成非常简单!",
|
||||
):
|
||||
chunks.append(chunk)
|
||||
wav = np.concatenate(chunks)
|
||||
sf.write("streaming.wav", wav, model.tts_model.sample_rate)
|
||||
```
|
||||
</details>
|
||||
|
||||
### 命令行使用
|
||||
|
||||
```bash
|
||||
# 音色设计(无需参考音频)
|
||||
voxcpm design \
|
||||
--text "VoxCPM2带来全新语音合成体验。" \
|
||||
--output out.wav
|
||||
|
||||
# 可控声音克隆(带风格控制)
|
||||
voxcpm design \
|
||||
--text "VoxCPM2带来全新语音合成体验。" \
|
||||
--control "年轻女声,温暖温柔,略带微笑" \
|
||||
--output out.wav
|
||||
|
||||
# 声音克隆(参考音频)
|
||||
voxcpm clone \
|
||||
--text "这是一个声音克隆的演示。" \
|
||||
--reference-audio path/to/voice.wav \
|
||||
--output out.wav
|
||||
|
||||
# 极致克隆(提示音频 + 转录文本)
|
||||
voxcpm clone \
|
||||
--text "这是一个声音克隆的演示。" \
|
||||
--prompt-audio path/to/voice.wav \
|
||||
--prompt-text "参考音频转录文本" \
|
||||
--reference-audio path/to/voice.wav \
|
||||
--output out.wav
|
||||
|
||||
# 批量处理
|
||||
voxcpm batch --input examples/input.txt --output-dir outs
|
||||
|
||||
# 帮助
|
||||
voxcpm --help
|
||||
```
|
||||
|
||||
### Web Demo
|
||||
|
||||
```bash
|
||||
python app.py --model-dir /path/to/VoxCPM2 --port 8808 # 指定本地模型路径,然后打开 http://localhost:8808
|
||||
```
|
||||
|
||||
### 🚢 生产部署(Nano-vLLM)
|
||||
|
||||
如需高吞吐量部署,使用 [**Nano-vLLM-VoxCPM**](https://github.com/a710128/nanovllm-voxcpm) — 基于 Nano-vLLM 构建的专用推理引擎,支持并发请求和异步 API。
|
||||
|
||||
```bash
|
||||
pip install nano-vllm-voxcpm
|
||||
```
|
||||
|
||||
```python
|
||||
from nanovllm_voxcpm import VoxCPM
|
||||
import numpy as np, soundfile as sf
|
||||
|
||||
server = VoxCPM.from_pretrained(model="/path/to/VoxCPM", devices=[0])
|
||||
chunks = list(server.generate(target_text="你好,我来自VoxCPM!"))
|
||||
sf.write("out.wav", np.concatenate(chunks), 48000)
|
||||
server.stop()
|
||||
```
|
||||
|
||||
> **在 NVIDIA RTX 4090 上 RTF 低至 ~0.13**(标准 PyTorch 实现约 ~0.3),支持批量并发请求和 FastAPI HTTP 服务。详见 [Nano-vLLM-VoxCPM 仓库](https://github.com/a710128/nanovllm-voxcpm)。
|
||||
|
||||
> **完整参数说明、多场景示例与声音克隆技巧 →** [快速开始指南](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html) | [使用指南](https://voxcpm.readthedocs.io/zh-cn/latest/usage_guide.html) | [Cookbook](https://voxcpm.readthedocs.io/zh-cn/latest/cookbook.html)
|
||||
|
||||
---
|
||||
|
||||
## 📦 模型与版本
|
||||
|
||||
| | **VoxCPM2** | **VoxCPM1.5** | **VoxCPM-0.5B** |
|
||||
|---|:---:|:---:|:---:|
|
||||
| **状态** | 🟢 最新版本 | 稳定版 | 旧版 |
|
||||
| **主模型参数量** | 2B | 0.6B | 0.5B |
|
||||
| **音频采样率** | 48kHz | 44.1kHz | 16kHz |
|
||||
| **LM处理码率** | 6.25Hz | 6.25Hz | 12.5Hz |
|
||||
| **语言支持数量** | 30 | 2(中文、英文) | 2(中文、英文) |
|
||||
| **克隆模式** | 隔离参考音频(无需文本) & 音频续写 | 仅音频续写 | 仅音频续写 |
|
||||
| **音色设计** | ✅ | — | — |
|
||||
| **可控声音克隆** | ✅ | — | — |
|
||||
| **SFT / LoRA** | ✅ | ✅ | ✅ |
|
||||
| **RTF (RTX 4090)** | ~0.30 | ~0.15 | ~0.17 |
|
||||
| **RTF Nano-vLLM (RTX 4090)** | ~0.13 | ~0.08 | ~0.10 |
|
||||
| **显存占用** | ~8 GB | ~6 GB | ~5 GB |
|
||||
| **模型权重** | [🤗 HF](https://huggingface.co/openbmb/VoxCPM2) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM2) | [🤗 HF](https://huggingface.co/openbmb/VoxCPM1.5) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM1.5) | [🤗 HF](https://huggingface.co/openbmb/VoxCPM-0.5B) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) |
|
||||
| **技术报告** | 即将发布 | — | [arXiv](https://arxiv.org/abs/2509.24650) [ICLR 2026](https://openreview.net/forum?id=h5KLpGoqzC) |
|
||||
| **Demo 页面** | [音频示例](https://openbmb.github.io/voxcpm2-demopage) | — | [音频示例](https://openbmb.github.io/VoxCPM-demopage) |
|
||||
|
||||
VoxCPM2 采用**连续音频表征、扩散自回归**范式,模型在 **AudioVAE** 的连续隐空间中通过四阶段处理:**LocEnc → TSLM → RALM → LocDiT**,实现丰富的表现力语音合成和 48kHz 原生音频输出。
|
||||
|
||||
<div align="center">
|
||||
<img src="assets/voxcpm_model.png" alt="VoxCPM2 模型架构" width="90%">
|
||||
</div>
|
||||
|
||||
> 完整架构细节、VoxCPM2 升级内容和模型对比表见 [架构设计文档](https://voxcpm.readthedocs.io/zh-cn/latest/models/architecture.html)。
|
||||
|
||||
---
|
||||
|
||||
## 📊 性能评测
|
||||
|
||||
VoxCPM2 在公开的零样本和可控 TTS 基准测试中取得了 SOTA 或可比的结果。
|
||||
|
||||
### Seed-TTS-eval
|
||||
|
||||
<details>
|
||||
<summary><b>Seed-TTS-eval WER(⬇)&SIM(⬆) 结果(点击展开)</b></summary>
|
||||
|
||||
| Model | Parameters | Open-Source | test-EN | | test-ZH | | test-Hard | |
|
||||
|------|------|------|:------------:|:--:|:------------:|:--:|:-------------:|:--:|
|
||||
| | | | WER/%⬇ | SIM/%⬆| CER/%⬇| SIM/%⬆ | CER/%⬇ | SIM/%⬆ |
|
||||
| MegaTTS3 | 0.5B | ❌ | 2.79 | 77.1 | 1.52 | 79.0 | - | - |
|
||||
| DiTAR | 0.6B | ❌ | 1.69 | 73.5 | 1.02 | 75.3 | - | - |
|
||||
| CosyVoice3 | 0.5B | ❌ | 2.02 | 71.8 | 1.16 | 78.0 | 6.08 | 75.8 |
|
||||
| CosyVoice3 | 1.5B | ❌ | 2.22 | 72.0 | 1.12 | 78.1 | 5.83 | 75.8 |
|
||||
| Seed-TTS | - | ❌ | 2.25 | 76.2 | 1.12 | 79.6 | 7.59 | 77.6 |
|
||||
| MiniMax-Speech | - | ❌ | 1.65 | 69.2 | 0.83 | 78.3 | - | - |
|
||||
| F5-TTS | 0.3B | ✅ | 2.00 | 67.0 | 1.53 | 76.0 | 8.67 | 71.3 |
|
||||
| MaskGCT | 1B | ✅ | 2.62 | 71.7 | 2.27 | 77.4 | - | - |
|
||||
| CosyVoice | 0.3B | ✅ | 4.29 | 60.9 | 3.63 | 72.3 | 11.75 | 70.9 |
|
||||
| CosyVoice2 | 0.5B | ✅ | 3.09 | 65.9 | 1.38 | 75.7 | 6.83 | 72.4 |
|
||||
| SparkTTS | 0.5B | ✅ | 3.14 | 57.3 | 1.54 | 66.0 | - | - |
|
||||
| FireRedTTS | 0.5B | ✅ | 3.82 | 46.0 | 1.51 | 63.5 | 17.45 | 62.1 |
|
||||
| FireRedTTS-2 | 1.5B | ✅ | 1.95 | 66.5 | 1.14 | 73.6 | - | - |
|
||||
| Qwen2.5-Omni | 7B | ✅ | 2.72 | 63.2 | 1.70 | 75.2 | 7.97 | 74.7 |
|
||||
| Qwen3-Omni | 30B-A3B | ✅ | 1.39 | - | 1.07 | - | - | - |
|
||||
| OpenAudio-s1-mini | 0.5B | ✅ | 1.94 | 55.0 | 1.18 | 68.5 | 23.37 | 64.3 |
|
||||
| IndexTTS2 | 1.5B | ✅ | 2.23 | 70.6 | 1.03 | 76.5 | 7.12 | 75.5 |
|
||||
| VibeVoice | 1.5B | ✅ | 3.04 | 68.9 | 1.16 | 74.4 | - | - |
|
||||
| HiggsAudio-v2 | 3B | ✅ | 2.44 | 67.7 | 1.50 | 74.0 | 55.07 | 65.6 |
|
||||
| VoxCPM-0.5B | 0.6B | ✅ | 1.85 | 72.9 | 0.93 | 77.2 | 8.87 | 73.0 |
|
||||
| VoxCPM1.5 | 0.8B | ✅ | 2.12 | 71.4 | 1.18 | 77.0 | 7.74 | 73.1 |
|
||||
| MOSS-TTS | | ✅ | 1.85 | 73.4 | 1.20 | 78.8 | - | - |
|
||||
| Qwen3-TTS | 1.7B | ✅ | 1.23 | 71.7 | 1.22 | 77.0 | 6.76 | 74.8 |
|
||||
| FishAudio S2 | 4B | ✅ | 0.99 | - | 0.54 | - | 5.99 | - |
|
||||
| LongCat-Audio-DiT | 3.5B | ✅ | 1.50 | 78.6 | 1.09 | 81.8 | 6.04 | 79.7 |
|
||||
| **VoxCPM2** | 2B | ✅ | 1.84 | 75.3 | 0.97| 79.5| 8.13 | 75.3 |
|
||||
</details>
|
||||
|
||||
|
||||
### CV3-eval
|
||||
<details>
|
||||
<summary><b>CV3-eval 多语言 WER/CER(⬇) 结果(点击展开)</b></summary>
|
||||
|
||||
| Model | zh | en | hard-zh | hard-en | ja | ko | de | es | fr | it | ru |
|
||||
|-------|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|
||||
| CosyVoice2 | 4.08 | 6.32 | 12.58| 11.96| 9.13 | 19.7 |- | - | - | - | - |
|
||||
| CosyVoice3-1.5B | 3.91 | 4.99 | 9.77 | 10.55 | 7.57 | 5.69 | 6.43 | 4.47 | 11.8 | 10.5 | 6.64 |
|
||||
| Fish Audio S2 | 2.65 | 2.43 | 9.10 | 4.40 | 3.96 | 2.76 | 2.22 | 2.00 | 6.26 | 2.04 | 2.78 |
|
||||
| **VoxCPM2** | 3.65 | 5.00 | 8.55 | 8.48 | 5.96 | 5.69 | 4.77 | 3.80 | 9.85 | 4.25 | 5.21 |
|
||||
</details>
|
||||
|
||||
### MiniMax-Multilingual-Test
|
||||
|
||||
<details>
|
||||
<summary><b>Minimax-MLS-test WER(⬇) 结果(点击展开)</b></summary>
|
||||
|
||||
| Language | Minimax | ElevenLabs | Qwen3-TTS | FishAudio S2 | **VoxCPM2** |
|
||||
|----------|:-------:|:----------:|:--------------------:|:------------:|:-----------:|
|
||||
| Arabic | **1.665** | 1.666 | – | 3.500 | 13.046 |
|
||||
| Cantonese | 34.111 | 51.513 | – | **30.670** | 38.584 |
|
||||
| Chinese | 2.252 | 16.026 | 0.928 | **0.730** | 1.136 |
|
||||
| Czech | 3.875 | **2.108** | – | 2.840 | 24.132 |
|
||||
| Dutch | 1.143 | **0.803** | – | 0.990 | 0.913 |
|
||||
| English | 2.164 | 2.339 | **0.934** | 1.620 | 2.289 |
|
||||
| Finnish | 4.666 | 2.964 | – | 3.330 | **2.632** |
|
||||
| French | 4.099 | 5.216 | **2.858** | 3.050 | 4.534 |
|
||||
| German | 1.906 | 0.572 | 1.235 | **0.550** | 0.679 |
|
||||
| Greek | 2.016 | **0.991** | – | 5.740 | 2.844 |
|
||||
| Hindi | 6.962 | **5.827** | – | 14.640 | 19.699 |
|
||||
| Indonesian | 1.237 | **1.059** | – | 1.460 | 1.084 |
|
||||
| Italian | 1.543 | 1.743 | **0.948** | 1.270 | 1.563 |
|
||||
| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.628 |
|
||||
| Korean | 1.747 | 1.865 | 1.755 | **1.180** | 1.962 |
|
||||
| Polish | 1.415 | **0.766** | – | 1.260 | 1.141 |
|
||||
| Portuguese | 1.877 | 1.331 | 1.526 | **1.140** | 1.938 |
|
||||
| Romanian | 2.878 | **1.347** | – | 10.740 | 21.577 |
|
||||
| Russian | 4.281 | 3.878 | 3.212 | **2.400** | 3.634 |
|
||||
| Spanish | 1.029 | 1.084 | 1.126 | **0.910** | 1.438 |
|
||||
| Thai | **2.701** | 73.936 | – | 4.230 | 2.961 |
|
||||
| Turkish | 1.52 | **0.699** | – | 0.870 | 0.817 |
|
||||
| Ukrainian | 1.082 | **0.997** | – | 2.300 | 6.316 |
|
||||
| Vietnamese | **0.88** | 73.415 | – | 7.410 | 3.307 |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Minimax-MLS-test SIM(⬆) 结果(点击展开)</b></summary>
|
||||
|
||||
| Language | Minimax | ElevenLabs | Qwen3-TTS | FishAudio S2 | **VoxCPM2** |
|
||||
|----------|:-------:|:----------:|:--------------------:|:------------:|:-----------:|
|
||||
| Arabic | 73.6 | 70.6 | – | 75.0 | **79.1** |
|
||||
| Cantonese | 77.8 | 67.0 | – | 80.5 | **83.5** |
|
||||
| Chinese | 78.0 | 67.7 | 79.9 | 81.6 | **82.5** |
|
||||
| Czech | 79.6 | 68.5 | – | **79.8** | 78.3 |
|
||||
| Dutch | 73.8 | 68.0 | – | 73.0 | **80.8** |
|
||||
| English | 75.6 | 61.3 | 77.5 | 79.7 | **85.4** |
|
||||
| Finnish | 83.5 | 75.9 | – | 81.9 | **89.0** |
|
||||
| French | 62.8 | 53.5 | 62.8 | 69.8 | **73.5** |
|
||||
| German | 73.3 | 61.4 | 77.5 | 76.7 | **80.3** |
|
||||
| Greek | 82.6 | 73.3 | – | 79.5 | **86.0** |
|
||||
| Hindi | 81.8 | 73.0 | – | 82.1 | **85.6** |
|
||||
| Indonesian | 72.9 | 66.0 | – | 76.3 | **80.0** |
|
||||
| Italian | 69.9 | 57.9 | 81.7 | 74.7 | **78.0** |
|
||||
| Japanese | 77.6 | 73.8 | 78.8 | 79.6 | **82.8** |
|
||||
| Korean | 77.6 | 70.0 | 79.9 | 81.7 | **83.3** |
|
||||
| Polish | 80.2 | 72.9 | – | 81.9 | **88.4** |
|
||||
| Portuguese | 80.5 | 71.1 | 81.7 | 78.1 | **83.7** |
|
||||
| Romanian | **80.9** | 69.9 | – | 73.3 | 79.7 |
|
||||
| Russian | 76.1 | 67.6 | 79.2 | 79.0 | **81.1** |
|
||||
| Spanish | 76.2 | 61.5 | 81.4 | 77.6 | **83.1** |
|
||||
| Thai | 80.0 | 58.8 | – | 78.6 | **84.0** |
|
||||
| Turkish | 77.9 | 59.6 | – | 83.5 | **87.1** |
|
||||
| Ukrainian | 73.0 | 64.7 | – | 74.7 | **79.8** |
|
||||
| Vietnamese | 74.3 | 36.9 | – | 74.0 | **80.6** |
|
||||
|
||||
</details>
|
||||
|
||||
### Internal 30-Language ASR Benchmark
|
||||
|
||||
我们额外进行了内部多语言可懂度评测:**30 语种 × 500 样本**,ASR 转写评估使用 **Gemini 3.1 Flash Lite API**。
|
||||
|
||||
<details>
|
||||
<summary><b>内部30语种评测集ASR结果(点击展开)</b></summary>
|
||||
|
||||
| 语言 | 指标 | VoxCPM2 | Fish S2-Pro |
|
||||
|---|---:|---:|---:|
|
||||
| ar (阿拉伯语) | CER | 1.23% | 0.30% |
|
||||
| da (丹麦语) | WER | 2.70% | 3.52% |
|
||||
| de (德语) | WER | 0.96% | 0.64% |
|
||||
| el (希腊语) | WER | 3.17% | 4.61% |
|
||||
| en (英语) | WER | 0.42% | 1.03% |
|
||||
| es (西班牙语) | WER | 1.33% | 0.64% |
|
||||
| fi (芬兰语) | WER | 2.24% | 2.80% |
|
||||
| fr (法语) | WER | 2.16% | 2.34% |
|
||||
| he (希伯来语) | CER | 2.98% | 15.27% |
|
||||
| hi (印地语) | CER | 0.79% | 0.91% |
|
||||
| id (印尼语) | WER | 1.36% | 1.68% |
|
||||
| it (意大利语) | WER | 1.65% | 1.08% |
|
||||
| ja (日语) | CER | 2.40% | 1.82% |
|
||||
| km (高棉语) | CER | 2.05% | 75.15% |
|
||||
| ko (韩语) | CER | 0.95% | 0.29% |
|
||||
| lo (老挝语) | CER | 1.90% | 87.40% |
|
||||
| ms (马来语) | WER | 1.75% | 1.41% |
|
||||
| my (缅甸语) | CER | 1.42% | 85.27% |
|
||||
| nl (荷兰语) | WER | 1.25% | 1.68% |
|
||||
| no (挪威语) | WER | 2.49% | 3.76% |
|
||||
| pl (波兰语) | WER | 1.90% | 1.65% |
|
||||
| pt (葡萄牙语) | WER | 1.48% | 1.49% |
|
||||
| ru (俄语) | WER | 0.90% | 0.86% |
|
||||
| sv (瑞典语) | WER | 2.22% | 2.63% |
|
||||
| sw (斯瓦希里语) | CER | 1.07% | 2.02% |
|
||||
| th (泰语) | CER | 0.94% | 1.92% |
|
||||
| tl (菲律宾语) | WER | 2.63% | 4.00% |
|
||||
| tr (土耳其语) | WER | 1.65% | 1.65% |
|
||||
| vi (越南语) | WER | 1.56% | 5.56% |
|
||||
| zh (中文) | CER | 0.92% | 1.02% |
|
||||
| 平均(30 语种) | | **1.68%** | - |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### InstructTTSEval
|
||||
|
||||
<details>
|
||||
<summary><b>指令驱动音色设计结果 (点击展开)</b></summary>
|
||||
|
||||
| Model | InstructTTSEval-ZH | | | InstructTTSEval-EN | | |
|
||||
|-------|:---:|:----:|:----:|:----:|:----:|:----:|
|
||||
| | APS⬆| DSD⬆ | RP⬆| APS⬆ | DSD⬆ | RP⬆ |
|
||||
| Hume | – | – | – | 83.0 | 75.3 | 54.3 |
|
||||
| VoxInstruct | 47.5 | 52.3 | 42.6 | 54.9 | 57.0 | 39.3 |
|
||||
| Parler-tts-mini | – | – | – | 63.4 | 48.7 | 28.6 |
|
||||
| Parler-tts-large | – | – | – | 60.0 | 45.9 | 31.2 |
|
||||
| PromptTTS | – | – | – | 64.3 | 47.2 | 31.4 |
|
||||
| PromptStyle | – | – | – | 57.4 | 46.4 | 30.9 |
|
||||
| VoiceSculptor | 75.7 | 64.7 | 61.5 | – | – | – |
|
||||
| Mimo-Audio-7B-Instruct | 75.7 | 74.3 | 61.5 | 80.6 | 77.6 | 59.5 |
|
||||
| Qwen3TTS-12Hz-1.7B-VD | **85.2** | **81.1** | **65.1** | 82.9 | 82.4 | 68.4 |
|
||||
| **VoxCPM2** | **85.2** | 71.5 | 60.8 | **84.2** | **83.2** | **71.4** |
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ 微调
|
||||
|
||||
VoxCPM 支持**全参数微调(SFT)** 和 **LoRA 微调**。仅需 **5-10分钟** 的音频数据,即可适配特定说话人、语言或领域。
|
||||
|
||||
```bash
|
||||
# LoRA 微调(参数高效,推荐)
|
||||
python scripts/train_voxcpm_finetune.py \
|
||||
--config_path conf/voxcpm_v2/voxcpm_finetune_lora.yaml
|
||||
|
||||
# 全参数微调
|
||||
python scripts/train_voxcpm_finetune.py \
|
||||
--config_path conf/voxcpm_v2/voxcpm_finetune_all.yaml
|
||||
|
||||
# WebUI 训练与推理
|
||||
python lora_ft_webui.py # 然后打开 http://localhost:7860
|
||||
```
|
||||
|
||||
> **完整指南 →** [微调文档](https://voxcpm.readthedocs.io/zh-cn/latest/finetuning/finetune.html)(数据准备、配置、训练、LoRA 热切换、常见问题)
|
||||
|
||||
---
|
||||
|
||||
## 📚 文档
|
||||
|
||||
完整文档:**[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/zh-cn/latest/)**
|
||||
|
||||
| 主题 | 链接 |
|
||||
|---|---|
|
||||
| 快速开始与安装 | [快速开始](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html) |
|
||||
| 使用指南与 Cookbook | [使用指南](https://voxcpm.readthedocs.io/zh-cn/latest/usage_guide.html) |
|
||||
| VoxCPM 系列模型 | [模型列表](https://voxcpm.readthedocs.io/zh-cn/latest/models/version_history.html) |
|
||||
| 微调(SFT & LoRA) | [微调指南](https://voxcpm.readthedocs.io/zh-cn/latest/finetuning/finetune.html) |
|
||||
| 常见问题 | [FAQ](https://voxcpm.readthedocs.io/zh-cn/latest/faq.html) |
|
||||
|
||||
---
|
||||
|
||||
## 🌟 生态与社区
|
||||
|
||||
| 项目 | 说明 |
|
||||
|---|---|
|
||||
| [**Nano-vLLM**](https://github.com/a710128/nanovllm-voxcpm) | 高吞吐快速 GPU 推理引擎 |
|
||||
| [**VoxCPM.cpp**](https://github.com/bluryar/VoxCPM.cpp) | GGML/GGUF:CPU、CUDA、Vulkan 推理 |
|
||||
| [**VoxCPM-ONNX**](https://github.com/bluryar/VoxCPM-ONNX) | ONNX 导出,支持 CPU 推理 |
|
||||
| [**VoxCPMANE**](https://github.com/0seba/VoxCPMANE) | Apple Neural Engine 后端 |
|
||||
| [**voxcpm_rs**](https://github.com/madushan1000/voxcpm_rs) | Rust 重新实现 |
|
||||
| [**ComfyUI-VoxCPM**](https://github.com/wildminder/ComfyUI-VoxCPM) | ComfyUI 节点工作流 |
|
||||
| [**ComfyUI-VoxCPMTTS**](https://github.com/1038lab/ComfyUI-VoxCPMTTS) | ComfyUI TTS 扩展 |
|
||||
| [**TTS WebUI**](https://github.com/rsxdalv/tts_webui_extension.vox_cpm) | 浏览器端 TTS 扩展 |
|
||||
|
||||
> 完整生态见[文档](https://voxcpm.readthedocs.io/zh-cn/latest/)。社区项目非 OpenBMB 官方维护。做了什么有趣的东西?[提 Issue 或 PR](https://github.com/OpenBMB/VoxCPM/issues) 把它加进来!
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ 风险与局限性
|
||||
|
||||
- **滥用风险:** VoxCPM 的声音克隆能力可生成高度逼真的合成语音。**严禁**将 VoxCPM 用于冒充他人、欺诈或虚假信息传播。我们强烈建议对所有 AI 生成的内容进行明确标注。
|
||||
- **可控生成稳定性:** 音色设计和可控声音克隆的结果可能因生成次数而异 — 建议尝试生成 1~3 次以获得理想的音色或风格。我们正在积极提升可控性的一致性。
|
||||
- **语言覆盖:** VoxCPM2 官方支持 30 种语言。对于未列入的语言,欢迎直接测试或使用自有数据进行微调。我们计划在未来版本中扩展语言覆盖。
|
||||
- **使用说明:** 本模型基于 Apache-2.0 协议发布。用于生产部署时,我们建议针对具体场景进行充分的测试和安全评估。
|
||||
|
||||
---
|
||||
|
||||
## 📖 引用
|
||||
|
||||
如果 VoxCPM 对您有帮助,请考虑引用我们的工作并为仓库加星 ⭐!
|
||||
|
||||
```bib
|
||||
@article{voxcpm2_2026,
|
||||
title = {VoxCPM2: Tokenizer-Free TTS for Multilingual Speech Generation, Creative Voice Design, and True-to-Life Cloning},
|
||||
author = {VoxCPM Team},
|
||||
journal = {GitHub},
|
||||
year = {2026},
|
||||
}
|
||||
|
||||
@article{voxcpm2025,
|
||||
title = {VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation
|
||||
and True-to-Life Voice Cloning},
|
||||
author = {Zhou, Yixuan and Zeng, Guoyang and Liu, Xin and Li, Xiang and
|
||||
Yu, Renjie and Wang, Ziyang and Ye, Runchuan and Sun, Weiyue and
|
||||
Gui, Jiancheng and Li, Kehan and Wu, Zhiyong and Liu, Zhiyuan},
|
||||
journal = {arXiv preprint arXiv:2509.24650},
|
||||
year = {2025},
|
||||
}
|
||||
```
|
||||
|
||||
## 📄 许可证
|
||||
|
||||
VoxCPM 模型权重和代码基于 [Apache-2.0](LICENSE) 协议开源。
|
||||
|
||||
## 🙏 致谢
|
||||
|
||||
- [DiTAR](https://arxiv.org/abs/2502.03930) 扩散自回归骨干架构
|
||||
- [MiniCPM-4](https://github.com/OpenBMB/MiniCPM) 语言模型基座
|
||||
- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 基于 Flow Matching 的 LocDiT 实现
|
||||
- [DAC](https://github.com/descriptinc/descript-audio-codec) Audio VAE 骨干
|
||||
- 感谢所有社区用户试用 VoxCPM、反馈问题、分享想法和贡献——你们的支持让项目持续进步
|
||||
|
||||
## 机构
|
||||
|
||||
<p>
|
||||
<a href="https://modelbest.cn/"><img src="assets/modelbest_logo.png" width="28px"> 面壁智能</a>
|
||||
|
||||
<a href="https://github.com/thuhcsi"><img src="assets/thuhcsi_logo.png" width="28px"> 清华大学人机交互实验室</a>
|
||||
</p>
|
||||
|
||||
## ⭐ Star 历史
|
||||
|
||||
[![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date)
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 9.5 KiB |
@@ -1,7 +1,8 @@
|
||||
pretrained_path: /path/to/VoxCPM2/
|
||||
train_manifest: /path/to/train.jsonl
|
||||
val_manifest: null
|
||||
sample_rate: 48000
|
||||
sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate
|
||||
out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging
|
||||
batch_size: 2
|
||||
grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16
|
||||
num_workers: 8
|
||||
@@ -14,6 +15,7 @@ weight_decay: 0.01
|
||||
warmup_steps: 100
|
||||
max_steps: 1000
|
||||
max_batch_tokens: 8192
|
||||
max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled
|
||||
save_path: /path/to/checkpoints/finetune_all
|
||||
tensorboard: /path/to/logs/finetune_all
|
||||
lambdas:
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
pretrained_path: /path/to/VoxCPM2/
|
||||
train_manifest: /path/to/train.jsonl
|
||||
val_manifest: null
|
||||
sample_rate: 48000
|
||||
sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate
|
||||
out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging
|
||||
batch_size: 2
|
||||
grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16
|
||||
num_workers: 8
|
||||
@@ -14,6 +15,7 @@ weight_decay: 0.01
|
||||
warmup_steps: 100
|
||||
max_steps: 1000
|
||||
max_batch_tokens: 8192
|
||||
max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled
|
||||
save_path: /path/to/checkpoints/finetune_lora
|
||||
tensorboard: /path/to/logs/finetune_lora
|
||||
lambdas:
|
||||
|
||||
+74
-8
@@ -14,8 +14,10 @@ from typing import Optional
|
||||
project_root = Path(__file__).parent
|
||||
sys.path.insert(0, str(project_root / "src"))
|
||||
|
||||
# Default pretrained model path relative to this repo
|
||||
default_pretrained_path = str(project_root / "models" / "openbmb__VoxCPM1.5")
|
||||
# Default pretrained model path: prefer VoxCPM2 if it exists, fallback to VoxCPM1.5
|
||||
_v2_path = project_root / "models" / "openbmb__VoxCPM2"
|
||||
_v15_path = project_root / "models" / "openbmb__VoxCPM1.5"
|
||||
default_pretrained_path = str(_v2_path if _v2_path.exists() else _v15_path)
|
||||
|
||||
from voxcpm.core import VoxCPM
|
||||
from voxcpm.model.voxcpm import LoRAConfig
|
||||
@@ -99,6 +101,24 @@ def get_timestamp_str():
|
||||
return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
|
||||
def detect_sample_rate(pretrained_path: str) -> Optional[int]:
    """Read ``audio_vae_config.sample_rate`` from the model's ``config.json``.

    This is the AudioVAE *encoder* input rate, which is the correct rate for
    resampling training data.

    Args:
        pretrained_path: Directory expected to contain ``config.json``.

    Returns:
        The sample rate as an ``int``, or ``None`` when detection fails: the
        path is empty, the file is missing or unreadable, the JSON is
        malformed, or the value is absent or not convertible to ``int``.
    """
    # An empty path would otherwise resolve to "config.json" in the current
    # working directory, and None would raise inside os.path.join.
    if not pretrained_path:
        return None

    config_file = os.path.join(pretrained_path, "config.json")
    if not os.path.isfile(config_file):
        return None

    try:
        with open(config_file, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        return int(cfg["audio_vae_config"]["sample_rate"])
    # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
    # TypeError covers a null/non-dict "audio_vae_config" or a null rate;
    # OSError covers a file that cannot be read despite the isfile() check.
    except (KeyError, TypeError, ValueError, OSError) as e:
        print(f"Warning: failed to detect sample_rate from {config_file}: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def get_or_load_asr_model():
|
||||
global asr_model
|
||||
if asr_model is None:
|
||||
@@ -350,6 +370,7 @@ def start_training(
|
||||
warmup_steps=100,
|
||||
max_steps=None,
|
||||
sample_rate=44100,
|
||||
max_grad_norm=1.0,
|
||||
# LoRA advanced
|
||||
enable_lm=True,
|
||||
enable_dit=True,
|
||||
@@ -377,15 +398,39 @@ def start_training(
|
||||
os.makedirs(checkpoints_dir, exist_ok=True)
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
# Auto-detect sample_rate from model config.json to prevent mismatch
|
||||
detected_sr = detect_sample_rate(pretrained_path)
|
||||
if detected_sr is not None:
|
||||
if int(sample_rate) != detected_sr:
|
||||
training_log += (
|
||||
f"[Auto-fix] sample_rate changed from {int(sample_rate)} to {detected_sr} "
|
||||
f"(read from {pretrained_path}/config.json audio_vae_config.sample_rate)\n"
|
||||
)
|
||||
sample_rate = detected_sr
|
||||
|
||||
# Create config dictionary
|
||||
# Resolve max_steps default
|
||||
resolved_max_steps = int(max_steps) if max_steps not in (None, "", 0) else int(num_iters)
|
||||
|
||||
# Auto-detect out_sample_rate from model config
|
||||
out_sample_rate = 0
|
||||
config_file = os.path.join(pretrained_path, "config.json")
|
||||
if os.path.isfile(config_file):
|
||||
try:
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
out_sr = cfg.get("audio_vae_config", {}).get("out_sample_rate")
|
||||
if out_sr:
|
||||
out_sample_rate = int(out_sr)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
config = {
|
||||
"pretrained_path": pretrained_path,
|
||||
"train_manifest": train_manifest,
|
||||
"val_manifest": val_manifest,
|
||||
"sample_rate": int(sample_rate),
|
||||
"out_sample_rate": out_sample_rate,
|
||||
"batch_size": int(batch_size),
|
||||
"grad_accum_steps": int(grad_accum_steps),
|
||||
"num_workers": int(num_workers),
|
||||
@@ -397,6 +442,7 @@ def start_training(
|
||||
"weight_decay": float(weight_decay),
|
||||
"warmup_steps": int(warmup_steps),
|
||||
"max_steps": resolved_max_steps,
|
||||
"max_grad_norm": float(max_grad_norm),
|
||||
"save_path": checkpoints_dir,
|
||||
"tensorboard": tensorboard_path if tensorboard_path else logs_dir,
|
||||
"lambdas": {"loss/diff": 1.0, "loss/stop": 1.0},
|
||||
@@ -904,17 +950,19 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
with gr.Row():
|
||||
max_steps = gr.Number(label="最大步数 (max_steps, 0→默认num_iters)", value=0, precision=0)
|
||||
sample_rate = gr.Number(label="采样率 (sample_rate)", value=44100, precision=0)
|
||||
tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="")
|
||||
max_grad_norm = gr.Number(label="梯度裁剪 (max_grad_norm, 0=关闭)", value=1.0)
|
||||
with gr.Row():
|
||||
tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="")
|
||||
enable_lm = gr.Checkbox(label="启用 LoRA LM (enable_lm)", value=True)
|
||||
enable_dit = gr.Checkbox(label="启用 LoRA DIT (enable_dit)", value=True)
|
||||
with gr.Row():
|
||||
enable_proj = gr.Checkbox(label="启用投影 (enable_proj)", value=False)
|
||||
dropout = gr.Number(label="LoRA Dropout", value=0.0)
|
||||
|
||||
gr.Markdown("#### 分发选项 (Distribution)")
|
||||
with gr.Row():
|
||||
hf_model_id = gr.Textbox(
|
||||
label="HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)", value="openbmb/VoxCPM1.5"
|
||||
label="HuggingFace Model ID (e.g., openbmb/VoxCPM2)", value=""
|
||||
)
|
||||
distribute = gr.Checkbox(label="分发模式 (distribute)", value=False)
|
||||
|
||||
@@ -929,6 +977,19 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
show_label=False,
|
||||
)
|
||||
|
||||
def on_pretrained_path_change(path):
    """Refresh the sample-rate field when the pretrained model path changes.

    Looks up the rate via ``detect_sample_rate(path)`` and returns a Gradio
    update carrying that value; when nothing is detected, an empty update is
    returned instead.
    """
    detected = detect_sample_rate(path)
    return gr.update() if detected is None else gr.update(value=detected)
|
||||
|
||||
train_pretrained_path.change(
|
||||
on_pretrained_path_change,
|
||||
inputs=[train_pretrained_path],
|
||||
outputs=[sample_rate],
|
||||
)
|
||||
|
||||
start_btn.click(
|
||||
start_training,
|
||||
inputs=[
|
||||
@@ -951,6 +1012,7 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
warmup_steps,
|
||||
max_steps,
|
||||
sample_rate,
|
||||
max_grad_norm,
|
||||
enable_lm,
|
||||
enable_dit,
|
||||
enable_proj,
|
||||
@@ -1109,12 +1171,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
"warmup_steps": "warmup_steps",
|
||||
"max_steps": "最大步数 (max_steps)",
|
||||
"sample_rate": "采样率 (sample_rate)",
|
||||
"max_grad_norm": "梯度裁剪 (max_grad_norm, 0=关闭)",
|
||||
"enable_lm": "启用 LoRA LM (enable_lm)",
|
||||
"enable_dit": "启用 LoRA DIT (enable_dit)",
|
||||
"enable_proj": "启用投影 (enable_proj)",
|
||||
"dropout": "LoRA Dropout",
|
||||
"tensorboard_path": "Tensorboard 路径 (可选)",
|
||||
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)",
|
||||
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)",
|
||||
"distribute": "分发模式 (distribute)",
|
||||
}
|
||||
else:
|
||||
@@ -1127,12 +1190,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
"warmup_steps": "Warmup Steps",
|
||||
"max_steps": "Max Steps",
|
||||
"sample_rate": "Sample Rate",
|
||||
"max_grad_norm": "Max Grad Norm (0=disabled)",
|
||||
"enable_lm": "Enable LoRA LM",
|
||||
"enable_dit": "Enable LoRA DIT",
|
||||
"enable_proj": "Enable Projection",
|
||||
"dropout": "LoRA Dropout",
|
||||
"tensorboard_path": "Tensorboard Path (Optional)",
|
||||
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)",
|
||||
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)",
|
||||
"distribute": "Distribute Mode",
|
||||
}
|
||||
|
||||
@@ -1162,11 +1226,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
gr.update(label=adv["warmup_steps"]),
|
||||
gr.update(label=adv["max_steps"]),
|
||||
gr.update(label=adv["sample_rate"]),
|
||||
gr.update(label=adv["max_grad_norm"]),
|
||||
gr.update(label=adv["tensorboard_path"]),
|
||||
gr.update(label=adv["enable_lm"]),
|
||||
gr.update(label=adv["enable_dit"]),
|
||||
gr.update(label=adv["enable_proj"]),
|
||||
gr.update(label=adv["dropout"]),
|
||||
gr.update(label=adv["tensorboard_path"]),
|
||||
# Distribution options
|
||||
gr.update(label=adv["hf_model_id"]),
|
||||
gr.update(label=adv["distribute"]),
|
||||
@@ -1213,11 +1278,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
|
||||
warmup_steps,
|
||||
max_steps,
|
||||
sample_rate,
|
||||
max_grad_norm,
|
||||
tensorboard_path,
|
||||
enable_lm,
|
||||
enable_dit,
|
||||
enable_proj,
|
||||
dropout,
|
||||
tensorboard_path,
|
||||
# distribution outputs
|
||||
hf_model_id,
|
||||
distribute,
|
||||
|
||||
@@ -30,7 +30,8 @@ except ImportError:
|
||||
import json
|
||||
|
||||
from voxcpm.model import VoxCPMModel, VoxCPM2Model
|
||||
from voxcpm.model.voxcpm import LoRAConfig
|
||||
from voxcpm.model.voxcpm import LoRAConfig as LoRAConfigV1
|
||||
from voxcpm.model.voxcpm2 import LoRAConfig as LoRAConfigV2
|
||||
from voxcpm.training import (
|
||||
Accelerator,
|
||||
BatchProcessor,
|
||||
@@ -46,6 +47,7 @@ def train(
|
||||
train_manifest: str,
|
||||
val_manifest: str = "",
|
||||
sample_rate: int = 16_000,
|
||||
out_sample_rate: int = 0, # AudioVAE decoder output rate; used for TensorBoard audio logging
|
||||
batch_size: int = 1,
|
||||
grad_accum_steps: int = 1,
|
||||
num_workers: int = 2,
|
||||
@@ -63,6 +65,7 @@ def train(
|
||||
lambdas: Dict[str, float] = {"loss/diff": 1.0, "loss/stop": 1.0},
|
||||
lora: dict = None,
|
||||
config_path: str = "",
|
||||
max_grad_norm: float = 0.0, # gradient clipping; 0 = disabled (backward compat)
|
||||
# Distribution options (for LoRA checkpoints)
|
||||
hf_model_id: str = "", # HuggingFace model ID (e.g., "openbmb/VoxCPM1.5")
|
||||
distribute: bool = False, # If True, save hf_model_id as base_model; otherwise save pretrained_path
|
||||
@@ -91,6 +94,7 @@ def train(
|
||||
with open(os.path.join(pretrained_path, "config.json"), "r", encoding="utf-8") as _f:
|
||||
_arch = json.load(_f).get("architecture", "voxcpm").lower()
|
||||
_model_cls = VoxCPM2Model if _arch == "voxcpm2" else VoxCPMModel
|
||||
LoRAConfig = LoRAConfigV2 if _arch == "voxcpm2" else LoRAConfigV1
|
||||
if accelerator.rank == 0:
|
||||
print(f"Detected architecture: {_arch} -> {_model_cls.__name__}", file=sys.stderr)
|
||||
base_model = _model_cls.from_local(
|
||||
@@ -98,6 +102,12 @@ def train(
|
||||
)
|
||||
tokenizer = base_model.text_tokenizer
|
||||
|
||||
expected_sr = base_model.audio_vae.sample_rate
|
||||
assert sample_rate == expected_sr, (
|
||||
f"sample_rate mismatch: config says {sample_rate}, but the AudioVAE encoder expects {expected_sr}. "
|
||||
f"Please set sample_rate: {expected_sr} in your training config. "
|
||||
)
|
||||
|
||||
train_ds, val_ds = load_audio_text_datasets(
|
||||
train_manifest=train_manifest,
|
||||
val_manifest=val_manifest,
|
||||
@@ -170,8 +180,12 @@ def train(
|
||||
dataset_cnt=dataset_cnt,
|
||||
device=accelerator.device,
|
||||
)
|
||||
# Save audio_vae for audio generation
|
||||
# Save audio_vae and output sample rate for audio generation.
|
||||
# Prefer model's actual output rate; fall back to YAML out_sample_rate or encode rate.
|
||||
audio_vae_for_gen = base_model.audio_vae
|
||||
out_sr = base_model.sample_rate # decoder output rate (e.g. 48000 for V2)
|
||||
if out_sr == 0 and out_sample_rate > 0:
|
||||
out_sr = out_sample_rate
|
||||
del base_model.audio_vae
|
||||
model = accelerator.prepare_model(base_model)
|
||||
unwrapped_model = accelerator.unwrap(model)
|
||||
@@ -304,8 +318,8 @@ def train(
|
||||
scaler = getattr(accelerator, "scaler", None)
|
||||
if scaler is not None:
|
||||
scaler.unscale_(optimizer)
|
||||
# Use large max_norm to only compute grad_norm without actual clipping
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=1e9)
|
||||
effective_max_norm = max_grad_norm if max_grad_norm > 0 else 1e9
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=effective_max_norm)
|
||||
|
||||
accelerator.step(optimizer)
|
||||
accelerator.update()
|
||||
@@ -333,6 +347,7 @@ def train(
|
||||
val_ds=val_ds,
|
||||
audio_vae=audio_vae_for_gen,
|
||||
sample_rate=sample_rate,
|
||||
out_sample_rate=out_sr,
|
||||
val_texts=val_texts,
|
||||
tokenizer=tokenizer,
|
||||
valid_interval=valid_interval,
|
||||
@@ -359,6 +374,7 @@ def validate(
|
||||
val_ds=None,
|
||||
audio_vae=None,
|
||||
sample_rate=22050,
|
||||
out_sample_rate=0,
|
||||
val_texts=None,
|
||||
tokenizer=None,
|
||||
valid_interval=1000,
|
||||
@@ -424,6 +440,7 @@ def validate(
|
||||
step,
|
||||
accelerator,
|
||||
sample_rate,
|
||||
out_sample_rate=out_sample_rate,
|
||||
val_texts=val_texts,
|
||||
tokenizer=tokenizer,
|
||||
valid_interval=valid_interval,
|
||||
@@ -526,6 +543,7 @@ def generate_sample_audio(
|
||||
step,
|
||||
accelerator,
|
||||
sample_rate=22050,
|
||||
out_sample_rate=0,
|
||||
val_texts=None,
|
||||
tokenizer=None,
|
||||
pretrained_path=None,
|
||||
@@ -540,6 +558,10 @@ def generate_sample_audio(
|
||||
log(f"[Audio] Starting audio generation for {num_samples} samples at step {step}")
|
||||
|
||||
unwrapped_model = accelerator.unwrap(model)
|
||||
# Determine the correct output sample rate for generated audio.
|
||||
# out_sample_rate is the decoder output rate (e.g. 48kHz for V2);
|
||||
# sample_rate is the encoder input rate (e.g. 16kHz for V2).
|
||||
gen_sr = out_sample_rate if out_sample_rate > 0 else sample_rate
|
||||
|
||||
for i in range(num_samples):
|
||||
sample = val_ds[i]
|
||||
@@ -596,10 +618,10 @@ def generate_sample_audio(
|
||||
gen_audio_np = normalize_audio(gen_audio_np)
|
||||
|
||||
tag = f"val_sample_{i}"
|
||||
writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=sample_rate)
|
||||
log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/sample_rate:.2f}s")
|
||||
writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=gen_sr)
|
||||
log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/gen_sr:.2f}s")
|
||||
|
||||
# Log reference audio
|
||||
# Log reference audio (at encoder input rate, which is what val_ds provides)
|
||||
if ref_audio_np is not None:
|
||||
writer.add_audio(
|
||||
f"{tag}/reference_audio", normalize_audio(ref_audio_np), global_step=step, sample_rate=sample_rate
|
||||
@@ -607,9 +629,9 @@ def generate_sample_audio(
|
||||
|
||||
# Generate mel spectrogram figure
|
||||
try:
|
||||
mel_gen = compute_mel_spectrogram(gen_audio_np, sample_rate)
|
||||
mel_gen = compute_mel_spectrogram(gen_audio_np, gen_sr)
|
||||
mel_ref = compute_mel_spectrogram(ref_audio_np, sample_rate) if ref_audio_np is not None else None
|
||||
fig = create_mel_figure(gen_audio_np, mel_gen, sample_rate, step, ref_audio_np, mel_ref)
|
||||
fig = create_mel_figure(gen_audio_np, mel_gen, gen_sr, step, ref_audio_np, mel_ref)
|
||||
writer.add_figure(f"{tag}/mel_spectrogram", fig, global_step=step)
|
||||
log(f"[Audio] Created mel spectrogram figure for sample {i}")
|
||||
except Exception as e:
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
+66
-54
@@ -48,25 +48,8 @@ from ..modules.minicpm4 import MiniCPM4Config, MiniCPMModel
|
||||
from .utils import get_dtype, mask_multichar_chinese_tokens
|
||||
|
||||
|
||||
def _trim_audio_silence_vad(
|
||||
audio: torch.Tensor,
|
||||
sample_rate: int,
|
||||
max_silence_ms: float = 200.0,
|
||||
top_db: float = 35.0,
|
||||
) -> torch.Tensor:
|
||||
"""使用能量阈值(VAD 方式)截取首尾静音及尾部长段伪静音,首尾各最多保留 max_silence_ms 毫秒静音。
|
||||
|
||||
会同时截掉末尾的长段伪静音(低能量但非完全静音的段落,如长时间底噪)。
|
||||
|
||||
Args:
|
||||
audio: (1, T) 的音频 tensor
|
||||
sample_rate: 采样率
|
||||
max_silence_ms: 首尾允许保留的最大静音长度(毫秒)
|
||||
top_db: 低于参考电平多少 dB 视为静音
|
||||
|
||||
Returns:
|
||||
截取后的 (1, T') tensor
|
||||
"""
|
||||
# A simple helper that trims audio silence using VAD; not used by default.
|
||||
def _trim_audio_silence_vad(audio: torch.Tensor, sample_rate: int, max_silence_ms: float = 200.0, top_db: float = 35.0) -> torch.Tensor:
|
||||
if audio.numel() == 0:
|
||||
return audio
|
||||
y = audio.squeeze(0).numpy()
|
||||
@@ -85,7 +68,7 @@ def _trim_audio_silence_vad(
|
||||
except Exception:
|
||||
start, end = 0, n
|
||||
|
||||
# 用逐帧 RMS 找「最后一段有持续能量的位置」,截掉末尾长伪静音(低能量底噪等)
|
||||
# Find the last frame with continuous energy, trim the long pseudo-silence at the end (low energy background noise, etc.)
|
||||
n_frames = max(0, (n - frame_length) // hop_length + 1)
|
||||
last_voice_frame = -1
|
||||
for j in range(n_frames):
|
||||
@@ -246,6 +229,7 @@ class VoxCPM2Model(nn.Module):
|
||||
# Audio VAE
|
||||
self.audio_vae = audio_vae
|
||||
self.chunk_size = audio_vae.chunk_size
|
||||
self._decode_chunk_size = getattr(audio_vae, "decode_chunk_size", audio_vae.chunk_size)
|
||||
self._encode_sample_rate = audio_vae.sample_rate
|
||||
self.sample_rate = getattr(audio_vae, "out_sample_rate", audio_vae.sample_rate)
|
||||
|
||||
@@ -382,11 +366,7 @@ class VoxCPM2Model(nn.Module):
|
||||
mu=dit_hidden,
|
||||
patch_size=self.patch_size,
|
||||
cond=feat_cond_for_sample,
|
||||
n_timesteps=(
|
||||
self.config.dit_config.cfm_config.inference_cfg_rate
|
||||
if hasattr(self.config.dit_config.cfm_config, "inference_cfg_rate")
|
||||
else 10
|
||||
),
|
||||
n_timesteps=10,
|
||||
)
|
||||
feat_pred = rearrange(feat_pred_seq.transpose(1, 2), "(b t) d p -> b d (t p)", b=B, p=self.patch_size)
|
||||
|
||||
@@ -402,19 +382,26 @@ class VoxCPM2Model(nn.Module):
|
||||
def _dtype(self):
|
||||
return get_dtype(self.config.dtype)
|
||||
|
||||
def _encode_wav(self, wav_path: str, padding_mode: str = "right") -> torch.Tensor:
|
||||
def _encode_wav(
|
||||
self,
|
||||
wav_path: str,
|
||||
padding_mode: str = "right",
|
||||
trim_silence_vad: bool = False,
|
||||
) -> torch.Tensor:
|
||||
"""Load, trim, pad and VAE-encode an audio file.
|
||||
|
||||
Args:
|
||||
wav_path: path to the audio file.
|
||||
padding_mode: "right" (default) or "left" padding for alignment.
|
||||
trim_silence_vad: whether to apply VAD-based silence trimming.
|
||||
|
||||
Returns:
|
||||
audio_feat: (T, P, D) tensor of latent patches.
|
||||
"""
|
||||
audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True)
|
||||
audio = torch.from_numpy(audio).unsqueeze(0)
|
||||
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
|
||||
if trim_silence_vad:
|
||||
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
|
||||
patch_len = self.patch_size * self.chunk_size
|
||||
if audio.size(1) % patch_len != 0:
|
||||
padding_size = patch_len - audio.size(1) % patch_len
|
||||
@@ -475,6 +462,7 @@ class VoxCPM2Model(nn.Module):
|
||||
retry_badcase: bool = False,
|
||||
retry_badcase_max_times: int = 3,
|
||||
retry_badcase_ratio_threshold: float = 6.0,
|
||||
trim_silence_vad: bool = False,
|
||||
streaming: bool = False,
|
||||
streaming_prefix_len: int = 4,
|
||||
) -> Generator[torch.Tensor, None, None]:
|
||||
@@ -495,8 +483,12 @@ class VoxCPM2Model(nn.Module):
|
||||
)
|
||||
text_length = text_token.shape[0]
|
||||
|
||||
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
|
||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
|
||||
ref_feat = self._encode_wav(
|
||||
reference_wav_path,
|
||||
padding_mode="right",
|
||||
trim_silence_vad=trim_silence_vad,
|
||||
)
|
||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
|
||||
prompt_audio_length = prompt_feat.size(0)
|
||||
|
||||
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
||||
@@ -538,7 +530,11 @@ class VoxCPM2Model(nn.Module):
|
||||
)
|
||||
text_length = text_token.shape[0]
|
||||
|
||||
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
|
||||
ref_feat = self._encode_wav(
|
||||
reference_wav_path,
|
||||
padding_mode="right",
|
||||
trim_silence_vad=trim_silence_vad,
|
||||
)
|
||||
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
||||
|
||||
text_pad_feat = torch.zeros(
|
||||
@@ -595,7 +591,7 @@ class VoxCPM2Model(nn.Module):
|
||||
)
|
||||
text_length = text_token.shape[0]
|
||||
|
||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
|
||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
|
||||
prompt_audio_length = prompt_feat.size(0)
|
||||
prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
|
||||
text_pad_feat = torch.zeros(
|
||||
@@ -640,14 +636,14 @@ class VoxCPM2Model(nn.Module):
|
||||
streaming_prefix_len=streaming_prefix_len,
|
||||
)
|
||||
if streaming:
|
||||
patch_len = self.patch_size * self.chunk_size
|
||||
for latent_pred, _ in inference_result:
|
||||
decode_patch_len = self.patch_size * self._decode_chunk_size
|
||||
for latent_pred, _, _ctx in inference_result:
|
||||
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
|
||||
decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
|
||||
decode_audio = decode_audio[..., -decode_patch_len:].squeeze(1).cpu()
|
||||
yield decode_audio
|
||||
break
|
||||
else:
|
||||
latent_pred, pred_audio_feat = next(inference_result)
|
||||
latent_pred, pred_audio_feat, context_len = next(inference_result)
|
||||
if retry_badcase:
|
||||
if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
|
||||
print(
|
||||
@@ -663,10 +659,9 @@ class VoxCPM2Model(nn.Module):
|
||||
|
||||
if not streaming:
|
||||
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
|
||||
patch_len = self.patch_size * self.chunk_size
|
||||
has_continuation = bool(prompt_wav_path)
|
||||
if has_continuation:
|
||||
decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1):].squeeze(1).cpu()
|
||||
decode_patch_len = self.patch_size * self._decode_chunk_size
|
||||
if context_len > 0:
|
||||
decode_audio = decode_audio[..., decode_patch_len * context_len:].squeeze(1).cpu()
|
||||
else:
|
||||
decode_audio = decode_audio.squeeze(1).cpu()
|
||||
yield decode_audio
|
||||
@@ -677,6 +672,7 @@ class VoxCPM2Model(nn.Module):
|
||||
prompt_text: str = None,
|
||||
prompt_wav_path: str = None,
|
||||
reference_wav_path: str = None,
|
||||
trim_silence_vad: bool = False,
|
||||
):
|
||||
"""
|
||||
Build prompt cache for subsequent generation.
|
||||
@@ -693,6 +689,8 @@ class VoxCPM2Model(nn.Module):
|
||||
Must be paired with ``prompt_text``.
|
||||
reference_wav_path: reference audio path for voice cloning
|
||||
(structurally isolated via ref_audio tokens).
|
||||
trim_silence_vad: whether to apply VAD-based silence trimming
|
||||
before encoding prompt/reference audio.
|
||||
|
||||
Returns:
|
||||
prompt_cache: dict used by ``_generate_with_prompt_cache``.
|
||||
@@ -705,11 +703,19 @@ class VoxCPM2Model(nn.Module):
|
||||
cache = {}
|
||||
|
||||
if reference_wav_path:
|
||||
cache["ref_audio_feat"] = self._encode_wav(reference_wav_path, padding_mode="right")
|
||||
cache["ref_audio_feat"] = self._encode_wav(
|
||||
reference_wav_path,
|
||||
padding_mode="right",
|
||||
trim_silence_vad=trim_silence_vad,
|
||||
)
|
||||
|
||||
if prompt_wav_path and prompt_text is not None:
|
||||
cache["prompt_text"] = prompt_text
|
||||
cache["audio_feat"] = self._encode_wav(prompt_wav_path, padding_mode="left")
|
||||
cache["audio_feat"] = self._encode_wav(
|
||||
prompt_wav_path,
|
||||
padding_mode="left",
|
||||
trim_silence_vad=trim_silence_vad,
|
||||
)
|
||||
|
||||
has_ref = "ref_audio_feat" in cache
|
||||
has_prompt = "audio_feat" in cache
|
||||
@@ -917,14 +923,14 @@ class VoxCPM2Model(nn.Module):
|
||||
streaming_prefix_len=streaming_prefix_len,
|
||||
)
|
||||
if streaming:
|
||||
patch_len = self.patch_size * self.chunk_size
|
||||
for latent_pred, pred_audio_feat in inference_result:
|
||||
decode_patch_len = self.patch_size * self._decode_chunk_size
|
||||
for latent_pred, pred_audio_feat, _ctx in inference_result:
|
||||
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
|
||||
decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
|
||||
decode_audio = decode_audio[..., -decode_patch_len:].squeeze(1).cpu()
|
||||
yield (decode_audio, target_text_token, pred_audio_feat)
|
||||
break
|
||||
else:
|
||||
latent_pred, pred_audio_feat = next(inference_result)
|
||||
latent_pred, pred_audio_feat, context_len = next(inference_result)
|
||||
if retry_badcase:
|
||||
if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
|
||||
print(
|
||||
@@ -939,18 +945,20 @@ class VoxCPM2Model(nn.Module):
|
||||
break
|
||||
if not streaming:
|
||||
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
|
||||
patch_len = self.patch_size * self.chunk_size
|
||||
if mode in ("continuation", "ref_continuation"):
|
||||
decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu()
|
||||
decode_patch_len = self.patch_size * self._decode_chunk_size
|
||||
if context_len > 0:
|
||||
decode_audio = decode_audio[..., decode_patch_len * context_len:].squeeze(1).cpu()
|
||||
else:
|
||||
decode_audio = decode_audio[..., :].squeeze(1).cpu()
|
||||
decode_audio = decode_audio.squeeze(1).cpu()
|
||||
yield (decode_audio, target_text_token, pred_audio_feat)
|
||||
|
||||
def inference(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
return next(self._inference(*args, streaming=False, **kwargs))
|
||||
feat_pred, generated_feat, _ = next(self._inference(*args, streaming=False, **kwargs))
|
||||
return feat_pred, generated_feat
|
||||
|
||||
def inference_streaming(self, *args, **kwargs) -> Generator[Tuple[torch.Tensor, List[torch.Tensor]], None, None]:
|
||||
return self._inference(*args, streaming=True, **kwargs)
|
||||
for feat_pred, pred_feat_seq, _ in self._inference(*args, streaming=True, **kwargs):
|
||||
yield feat_pred, pred_feat_seq
|
||||
|
||||
@torch.inference_mode()
|
||||
def _inference(
|
||||
@@ -1009,6 +1017,7 @@ class VoxCPM2Model(nn.Module):
|
||||
# trailing audio patches as initial context so the VAE can decode smoothly.
|
||||
# - Reference-only / zero-shot (feat_mask ends with 0): start from scratch.
|
||||
has_continuation_audio = feat_mask[0, -1].item() == 1
|
||||
context_len = 0
|
||||
if has_continuation_audio:
|
||||
audio_indices = feat_mask.squeeze(0).nonzero(as_tuple=True)[0]
|
||||
context_len = min(streaming_prefix_len - 1, len(audio_indices))
|
||||
@@ -1058,11 +1067,13 @@ class VoxCPM2Model(nn.Module):
|
||||
prefix_feat_cond = pred_feat
|
||||
|
||||
if streaming:
|
||||
# return the last three predicted latent features to provide enough context for smooth decoding
|
||||
pred_feat_chunk = torch.cat(pred_feat_seq[-streaming_prefix_len:], dim=1)
|
||||
feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)
|
||||
|
||||
yield feat_pred, pred_feat_seq
|
||||
yield feat_pred, pred_feat_seq, context_len
|
||||
|
||||
if len(pred_feat_seq) > streaming_prefix_len:
|
||||
pred_feat_seq = pred_feat_seq[-streaming_prefix_len:]
|
||||
|
||||
stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
|
||||
if i > min_len and stop_flag == 1:
|
||||
@@ -1081,7 +1092,8 @@ class VoxCPM2Model(nn.Module):
|
||||
if not streaming:
|
||||
pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d
|
||||
feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
|
||||
yield feat_pred, pred_feat_seq.squeeze(0).cpu()
|
||||
generated_feat = pred_feat_seq[:, context_len:, :, :].squeeze(0).cpu()
|
||||
yield feat_pred, generated_feat, context_len
|
||||
|
||||
@classmethod
|
||||
def from_local(cls, path: str, optimize: bool = True, training: bool = False, lora_config: LoRAConfig = None):
|
||||
|
||||
@@ -436,6 +436,7 @@ class AudioVAE(nn.Module):
|
||||
self.out_sample_rate = out_sample_rate
|
||||
self.sr_bin_boundaries = sr_bin_boundaries
|
||||
self.chunk_size = math.prod(encoder_rates)
|
||||
self.decode_chunk_size = math.prod(decoder_rates)
|
||||
|
||||
def preprocess(self, audio_data, sample_rate):
|
||||
if sample_rate is None:
|
||||
|
||||
@@ -225,7 +225,7 @@ class UnifiedCFM(torch.nn.Module):
|
||||
losses = F.mse_loss(u_pred, u_tgt.detach(), reduction="none").mean(dim=1)
|
||||
if tgt_mask is not None:
|
||||
weights = self.adaptive_loss_weighting(losses, tgt_mask.squeeze(1))
|
||||
loss = (weights * losses).sum() / torch.sum(tgt_mask)
|
||||
loss = (weights * losses).sum() / torch.clamp(torch.sum(tgt_mask), min=1.0)
|
||||
else:
|
||||
loss = losses.mean()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user