11 Commits

Author SHA1 Message Date
Labmem-Zhouyx 68af4fe502 fix: ft log and setting 2026-04-08 18:15:17 +08:00
Labmem-Zhouyx ee3649c1b3 fix: streaming decode 2026-04-08 17:25:54 +08:00
Labmem-Zhouyx 82d77d445c fix: decode chunksize for audiovae_v2 2026-04-08 16:31:36 +08:00
Labmem-Zhouyx 8f95d13073 update readme: 30-language asr result on internal benchmark 2026-04-08 15:36:56 +08:00
Labmem-Zhouyx df38f0a167 update readme for modelscope download 2026-04-08 11:29:19 +08:00
Labmem-Zhouyx 9adfaf6996 update demo for zh 2026-04-08 00:15:16 +08:00
刘鑫 46cfce0c97 fix VoxCPM2 training sample_rate: 48000 -> 16000 (match AudioVAE encoder)
Made-with: Cursor
2026-04-07 22:59:18 +08:00
Labmem-Zhouyx da700f264e update ZH readme 2026-04-07 18:04:56 +08:00
Labmem-Zhouyx 9da570d409 remove wechat link 2026-04-07 15:29:12 +08:00
Labmem-Zhouyx 9374524c47 update readme 2026-04-06 23:01:16 +08:00
Labmem-Zhouyx ec6d30e996 update readme 2026-04-06 22:56:06 +08:00
13 changed files with 850 additions and 84 deletions
+81 -10
View File
@@ -1,5 +1,9 @@
<h2 align="center">VoxCPM2: Tokenizer-Free TTS for Multilingual Speech Generation, Creative Voice Design, and True-to-Life Cloning</h2>
<p align="center">
<b>English</b> | <a href="./README_zh.md">中文</a>
</p>
<p align="center">
<a href="https://github.com/OpenBMB/VoxCPM/"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue" alt="Project Page"></a>
<a href="https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo"><img src="https://img.shields.io/badge/Live%20Playground-Demo-orange" alt="Live Playground"></a>
@@ -45,13 +49,13 @@ VoxCPM is a **tokenizer-free** Text-to-Speech system that directly generates con
- ⚡ **Real-Time Streaming** — RTF as low as ~0.3 on NVIDIA RTX 4090, and ~0.13 accelerated by [Nano-VLLM](https://github.com/a710128/nanovllm-voxcpm)
- 📜 **Fully Open-Source & Commercial-Ready** — Weights and code released under the [Apache-2.0](LICENSE) license, free for commercial use
<details>
<summary><b>🌍 Supported Languages (30)</b></summary>
<br>
Arabic, Burmese, Chinese, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Indonesian, Italian, Japanese, Khmer, Korean, Lao, Malay, Norwegian, Polish, Portuguese, Russian, Spanish, Swahili, Swedish, Tagalog, Thai, Turkish, Vietnamese
Chinese Dialect: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山东话, 天津话, 闽南话
</details>
### News
@@ -99,7 +103,7 @@ from voxcpm import VoxCPM
import soundfile as sf
model = VoxCPM.from_pretrained(
"openbmb/VoxCPM2"
"openbmb/VoxCPM2",
load_denoiser=False,
)
@@ -112,6 +116,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate)
print("saved: demo.wav")
```
If you prefer downloading from ModelScope first, you can use:
```bash
pip install modelscope
```
```python
from modelscope.hub.snapshot_download import snapshot_download
from voxcpm import VoxCPM
import soundfile as sf
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
wav = model.generate(
text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.",
cfg_value=2.0,
inference_timesteps=10,
)
sf.write("demo.wav", wav, model.tts_model.sample_rate)
```
#### 🎨 Voice Design
Create a voice from a natural-language description — no reference audio needed. **Format:** put the description in parentheses at the start of `text` (e.g. `"(your voice description)The text to synthesize."`):
@@ -132,13 +158,13 @@ Upload a reference audio. The model clones the timbre, and you can still use con
```python
wav = model.generate(
text="This is a cloned voice generated by VoxCPM2.",
reference_wav_path="speaker.wav",
reference_wav_path="path/to/voice.wav",
)
sf.write("clone.wav", wav, model.tts_model.sample_rate)
wav = model.generate(
text="(slightly faster, cheerful tone)This is a cloned voice with style control.",
reference_wav_path="speaker.wav",
reference_wav_path="path/to/voice.wav",
cfg_value=2.0,
inference_timesteps=10,
)
@@ -152,9 +178,9 @@ Provide both the reference audio and its exact transcript for audio-continuation
```python
wav = model.generate(
text="This is an ultimate cloning demonstration using VoxCPM2.",
prompt_wav_path="speaker_reference.wav",
prompt_wav_path="path/to/voice.wav",
prompt_text="The transcript of the reference audio.",
reference_wav_path="speaker_reference.wav",
reference_wav_path="path/to/voice.wav", # optional, for better similarity
)
sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate)
```
@@ -200,6 +226,7 @@ voxcpm clone \
--text "This is a voice cloning demo." \
--prompt-audio path/to/voice.wav \
--prompt-text "reference transcript" \
--output out.wav \
--reference-audio path/to/voice.wav  # optional, for better similarity
# Batch processing
@@ -211,8 +238,8 @@ voxcpm --help
### Web Demo
```bash
python app.py # then open http://localhost:7860
```bash
python app.py --model-dir /path/to/VoxCPM2 --port 8808 # use a local model directory, open http://localhost:8808
```
### 🚢 Production Deployment (Nano-vLLM)
@@ -388,10 +415,54 @@ VoxCPM2 achieves state-of-the-art or comparable results on public zero-shot and
</details>
### Internal 30-Language ASR Benchmark
We additionally run an internal multilingual intelligibility benchmark with **30 languages × 500 samples**. ASR transcription is evaluated via **Gemini 3.1 Flash Lite API**.
<details>
<summary><b>Internal 30-Language ASR Benchmark (click to expand)</b></summary>
| Language | Metric | VoxCPM2 | Fish S2-Pro |
|---|---:|---:|---:|
| ar (Arabic) | CER | 1.23% | 0.30% |
| da (Danish) | WER | 2.70% | 3.52% |
| de (German) | WER | 0.96% | 0.64% |
| el (Greek) | WER | 3.17% | 4.61% |
| en (English) | WER | 0.42% | 1.03% |
| es (Spanish) | WER | 1.33% | 0.64% |
| fi (Finnish) | WER | 2.24% | 2.80% |
| fr (French) | WER | 2.16% | 2.34% |
| he (Hebrew) | CER | 2.98% | 15.27% |
| hi (Hindi) | CER | 0.79% | 0.91% |
| id (Indonesian) | WER | 1.36% | 1.68% |
| it (Italian) | WER | 1.65% | 1.08% |
| ja (Japanese) | CER | 2.40% | 1.82% |
| km (Khmer) | CER | 2.05% | 75.15% |
| ko (Korean) | CER | 0.95% | 0.29% |
| lo (Lao) | CER | 1.90% | 87.40% |
| ms (Malay) | WER | 1.75% | 1.41% |
| my (Burmese) | CER | 1.42% | 85.27% |
| nl (Dutch) | WER | 1.25% | 1.68% |
| no (Norwegian) | WER | 2.49% | 3.76% |
| pl (Polish) | WER | 1.90% | 1.65% |
| pt (Portuguese) | WER | 1.48% | 1.49% |
| ru (Russian) | WER | 0.90% | 0.86% |
| sv (Swedish) | WER | 2.22% | 2.63% |
| sw (Swahili) | CER | 1.07% | 2.02% |
| th (Thai) | CER | 0.94% | 1.92% |
| tl (Tagalog) | WER | 2.63% | 4.00% |
| tr (Turkish) | WER | 1.65% | 1.65% |
| vi (Vietnamese) | WER | 1.56% | 5.56% |
| zh (Chinese) | CER | 0.92% | 1.02% |
| Average (30 languages) | | **1.68%** | - |
</details>
### InstructTTSEval
<details>
<summary><b>Instruction-Guided Voice Design Results</b></summary>
<summary><b>Instruction-Guided Voice Design Results (click to expand)</b></summary>
| Model | InstructTTSEval-ZH | | | InstructTTSEval-EN | | |
|-------|:---:|:----:|:----:|:----:|:----:|:----:|
+590
View File
@@ -0,0 +1,590 @@
<h2 align="center">VoxCPM2:基于连续表征的多语言语音合成、创意音色设计与高保真声音克隆</h2>
<p align="center">
<a href="./README.md">English</a> | <b>中文</b>
</p>
<p align="center">
<a href="https://github.com/OpenBMB/VoxCPM/"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue" alt="Project Page"></a>
<a href="https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo"><img src="https://img.shields.io/badge/Live%20Playground-Demo-orange" alt="Live Playground"></a>
<a href="https://voxcpm.readthedocs.io/zh-cn/latest/"><img src="https://img.shields.io/badge/Docs-ReadTheDocs-8CA1AF" alt="Documentation"></a>
<a href="https://huggingface.co/openbmb/VoxCPM2"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-VoxCPM2-yellow" alt="Hugging Face"></a>
<a href="https://modelscope.cn/models/OpenBMB/VoxCPM2"><img src="https://img.shields.io/badge/ModelScope-VoxCPM2-purple" alt="ModelScope"></a>
<a href="https://openbmb.github.io/voxcpm2-demopage/"><img src="https://img.shields.io/badge/DemoPage-Audio%20Samples-red" alt="Demo Page"></a>
</p>
<div align="center">
<img src="assets/voxcpm_logo.png" alt="VoxCPM Logo" width="35%">
<br><br>
<a href="https://trendshift.io/repositories/17704" target="_blank"><img src="https://trendshift.io/api/badge/repositories/17704" alt="OpenBMB%2FVoxCPM | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
<br>
<p align="center">
👋 欢迎加入社区,参与讨论与交流!
<br>
<a href="./assets/feishu-group.png" style="display:inline-block;vertical-align:middle; margin-left: 10px;">
<img src="./assets/feishu-logo.png" width="16" height="16" style="vertical-align:middle;"> 飞书群
</a>
&nbsp;|&nbsp;
<a href="https://discord.gg/KZUx7tVNwz" style="display:inline-block;vertical-align:middle;">
<img src="./assets/discord-logo.png" width="16" height="16" style="vertical-align:middle;"> Discord
</a>
</p>
VoxCPM 是一个**无离散音频分词器**(Tokenizer-Free)的语音合成系统,通过端到端的**扩散自回归架构**直接生成连续语音表征,绕过对音频的离散编码步骤,实现高度自然且富有表现力的语音合成。
**VoxCPM2** 是最新的版本 — 基于 [MiniCPM-4](https://github.com/OpenBMB/MiniCPM) 基座构建,总计 **20亿** 参数,在超过 **200万小时** 的多语种音频数据上训练,支持 **30种全球语言+9种中文方言**、**音色设计**、**可控声音克隆**,原生输出 **48kHz** 高质量音频。
### ✨ 核心特性
- 🌍 **30种语言语音合成** — 直接输入原始文本即可合成(支持语言详见下文),无需额外语言标签
- 🎨 **音色设计** — 用自然语言描述(性别、年龄、音色、情绪、语速……)凭空创建全新音色,无需参考音频
- 🎛️ **可控声音克隆** — 从参考音频片段克隆任意声音,可叠加风格指令控制情绪、语速和表现力,同时保持原始音色
- 🎙️ **极致克隆** — 提供参考音频及其文本内容,模型接着参考音频进行无缝续写,从而精准还原声音细节特征(与 VoxCPM1.5 一致)
- 🔊 **48kHz 高质量音频** — 输入 16kHz 参考音频,通过 AudioVAE V2 的非对称编解码设计直接输出 48kHz 高质量音频,内置超分能力
- 🧠 **语境感知合成** — 根据文本内容自动推断合适的韵律和表现力
- ⚡ **实时流式合成** — 在 NVIDIA RTX 4090 上 RTF 低至 ~0.3,通过 [Nano-VLLM](https://github.com/a710128/nanovllm-voxcpm) 加速后可达 ~0.13
- 📜 **完全开源,商用就绪** — 权重和代码基于 [Apache-2.0](LICENSE) 协议发布,免费商用
<details>
<summary><b>🌍 支持的语言(30种)</b></summary>
<br>
阿拉伯语、缅甸语、中文、丹麦语、荷兰语、英语、芬兰语、法语、德语、希腊语、希伯来语、印地语、印尼语、意大利语、日语、高棉语、韩语、老挝语、马来语、挪威语、波兰语、葡萄牙语、俄语、西班牙语、斯瓦希里语、瑞典语、菲律宾语、泰语、土耳其语、越南语
中国方言:四川话、粤语、吴语、东北话、河南话、陕西话、山东话、天津话、闽南话
</details>
### 最新动态
* **[2026.04]** 🔥 发布 **VoxCPM2** — 20亿参数,30种语言,音色设计与可控声音克隆,48kHz 音频输出! [模型权重](https://huggingface.co/openbmb/VoxCPM2) | [使用文档](https://voxcpm.readthedocs.io/zh-cn/latest/) | [在线体验](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) | [官网体验](https://voxcpm.modelbest.cn/) (适用国内访问)
* **[2025.12]** 🎉 开源 **VoxCPM1.5** [模型权重](https://huggingface.co/openbmb/VoxCPM1.5),支持 SFT 和 LoRA 微调。(**🏆 GitHub Trending #1**)
* **[2025.09]** 🔥 发布 VoxCPM [技术报告](https://arxiv.org/abs/2509.24650)。
* **[2025.09]** 🎉 开源 **VoxCPM-0.5B** [模型权重](https://huggingface.co/openbmb/VoxCPM-0.5B) (**🏆 HuggingFace Trending #1**)
---
## 目录
- [快速开始](#-快速开始)
- [安装](#安装)
- [Python API](#python-api)
- [命令行使用](#命令行使用)
- [Web Demo](#web-demo)
- [生产部署](#-生产部署nano-vllm)
- [模型与版本](#-模型与版本)
- [性能评测](#-性能评测)
- [微调](#%EF%B8%8F-微调)
- [文档](#-文档)
- [生态与社区](#-生态与社区)
- [风险与局限性](#%EF%B8%8F-风险与局限性)
- [引用](#-引用)
---
## 🚀 快速开始
### 安装
```sh
pip install voxcpm
```
> **环境要求:** Python ≥ 3.10,PyTorch ≥ 2.5.0,CUDA ≥ 12.0。详见 [快速开始文档](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html)。
### Python API
#### 🗣️ 文本转语音
```python
from voxcpm import VoxCPM
import soundfile as sf
model = VoxCPM.from_pretrained(
"openbmb/VoxCPM2",
load_denoiser=False,
)
wav = model.generate(
text="VoxCPM2 是目前推荐使用的多语言语音合成版本。",
cfg_value=2.0,
inference_timesteps=10,
)
sf.write("demo.wav", wav, model.tts_model.sample_rate)
print("已保存: demo.wav")
```
如果你希望先从 ModelScope 下载模型到本地(适用于国内网络访问),可以使用:
```bash
pip install modelscope
```
```python
from modelscope.hub.snapshot_download import snapshot_download
from voxcpm import VoxCPM
import soundfile as sf
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
wav = model.generate(
text="VoxCPM2 是目前推荐使用的多语言语音合成版本。",
cfg_value=2.0,
inference_timesteps=10,
)
sf.write("demo.wav", wav, model.tts_model.sample_rate)
```
#### 🎨 音色设计
用自然语言描述创建全新音色,无需参考音频。**格式:** 在 `text` 开头用括号写入音色描述(如 `"(音色描述)要合成的文本。"`):
```python
wav = model.generate(
text="(年轻女性,声音温柔甜美)你好,欢迎使用VoxCPM2!",
cfg_value=2.0,
inference_timesteps=10,
)
sf.write("voice_design.wav", wav, model.tts_model.sample_rate)
```
#### 🎛️ 可控声音克隆
上传一段参考音频,模型克隆其音色,同时可以使用控制指令调节语速、情绪或风格。
```python
wav = model.generate(
text="这是VoxCPM2生成的克隆语音。",
reference_wav_path="path/to/voice.wav",
)
sf.write("clone.wav", wav, model.tts_model.sample_rate)
wav = model.generate(
text="(稍快一点,欢快的语气)这是带风格控制的克隆语音。",
reference_wav_path="path/to/voice.wav",
cfg_value=2.0,
inference_timesteps=10,
)
sf.write("controllable_clone.wav", wav, model.tts_model.sample_rate)
```
#### 🎙️ 极致克隆
提供参考音频及其精确文本转录,实现基于音频续写的高保真克隆。为获得最高克隆相似度,可将同一音频同时传给 `reference_wav_path` 和 `prompt_wav_path`:
```python
wav = model.generate(
text="这是使用VoxCPM2的极致克隆演示。",
prompt_wav_path="path/to/voice.wav",
prompt_text="参考音频的文本转录。",
reference_wav_path="path/to/voice.wav", # 可选,提升相似度
)
sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate)
```
<details>
<summary><b>🔄 流式 API</b></summary>
```python
import numpy as np
chunks = []
for chunk in model.generate_streaming(
text="使用VoxCPM进行流式语音合成非常简单!",
):
chunks.append(chunk)
wav = np.concatenate(chunks)
sf.write("streaming.wav", wav, model.tts_model.sample_rate)
```
</details>
### 命令行使用
```bash
# 音色设计(无需参考音频)
voxcpm design \
--text "VoxCPM2带来全新语音合成体验。" \
--output out.wav
# 可控声音克隆(带风格控制)
voxcpm design \
--text "VoxCPM2带来全新语音合成体验。" \
--control "年轻女声,温暖温柔,略带微笑" \
--output out.wav
# 声音克隆(参考音频)
voxcpm clone \
--text "这是一个声音克隆的演示。" \
--reference-audio path/to/voice.wav \
--output out.wav
# 极致克隆(提示音频 + 转录文本)
voxcpm clone \
--text "这是一个声音克隆的演示。" \
--prompt-audio path/to/voice.wav \
--prompt-text "参考音频转录文本" \
--reference-audio path/to/voice.wav \
--output out.wav
# 批量处理
voxcpm batch --input examples/input.txt --output-dir outs
# 帮助
voxcpm --help
```
### Web Demo
```bash
python app.py --model-dir /path/to/VoxCPM2 --port 8808 # 指定本地模型路径,然后打开 http://localhost:8808
```
### 🚢 生产部署(Nano-vLLM)
如需高吞吐量部署,使用 [**Nano-vLLM-VoxCPM**](https://github.com/a710128/nanovllm-voxcpm) — 基于 Nano-vLLM 构建的专用推理引擎,支持并发请求和异步 API。
```bash
pip install nano-vllm-voxcpm
```
```python
from nanovllm_voxcpm import VoxCPM
import numpy as np, soundfile as sf
server = VoxCPM.from_pretrained(model="/path/to/VoxCPM", devices=[0])
chunks = list(server.generate(target_text="你好,我来自VoxCPM"))
sf.write("out.wav", np.concatenate(chunks), 48000)
server.stop()
```
> **在 NVIDIA RTX 4090 上 RTF 低至 ~0.13**(标准 PyTorch 实现约 ~0.3),支持批量并发请求和 FastAPI HTTP 服务。详见 [Nano-vLLM-VoxCPM 仓库](https://github.com/a710128/nanovllm-voxcpm)。
> **完整参数说明、多场景示例与声音克隆技巧 →** [快速开始指南](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html) | [使用指南](https://voxcpm.readthedocs.io/zh-cn/latest/usage_guide.html) | [Cookbook](https://voxcpm.readthedocs.io/zh-cn/latest/cookbook.html)
---
## 📦 模型与版本
| | **VoxCPM2** | **VoxCPM1.5** | **VoxCPM-0.5B** |
|---|:---:|:---:|:---:|
| **状态** | 🟢 最新版本 | 稳定版 | 旧版 |
| **主模型参数量** | 2B | 0.6B | 0.5B |
| **音频采样率** | 48kHz | 44.1kHz | 16kHz |
| **LM处理码率** | 6.25Hz | 6.25Hz | 12.5Hz |
| **语言支持数量** | 30 | 2(中文、英文) | 2(中文、英文) |
| **克隆模式** | 隔离参考音频(无需文本) & 音频续写 | 仅音频续写 | 仅音频续写 |
| **音色设计** | ✅ | — | — |
| **可控声音克隆** | ✅ | — | — |
| **SFT / LoRA** | ✅ | ✅ | ✅ |
| **RTF (RTX 4090)** | ~0.30 | ~0.15 | ~0.17 |
| **RTF Nano-VLLM (RTX 4090)** | ~0.13 | ~0.08 | ~0.10 |
| **显存占用** | ~8 GB | ~6 GB | ~5 GB |
| **模型权重** | [🤗 HF](https://huggingface.co/openbmb/VoxCPM2) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM2) | [🤗 HF](https://huggingface.co/openbmb/VoxCPM1.5) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM1.5) | [🤗 HF](https://huggingface.co/openbmb/VoxCPM-0.5B) / [MS](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) |
| **技术报告** | 即将发布 | — | [arXiv](https://arxiv.org/abs/2509.24650) [ICLR 2026](https://openreview.net/forum?id=h5KLpGoqzC) |
| **Demo 页面** | [音频示例](https://openbmb.github.io/voxcpm2-demopage) | — | [音频示例](https://openbmb.github.io/VoxCPM-demopage) |
VoxCPM2 采用**连续音频表征、扩散自回归**范式,模型在 **AudioVAE** 的连续隐空间中通过四阶段处理:**LocEnc → TSLM → RALM → LocDiT**,实现丰富的表现力语音合成和 48kHz 原生音频输出。
<div align="center">
<img src="assets/voxcpm_model.png" alt="VoxCPM2 模型架构" width="90%">
</div>
> 完整架构细节、VoxCPM2 升级内容和模型对比表见 [架构设计文档](https://voxcpm.readthedocs.io/zh-cn/latest/models/architecture.html)。
---
## 📊 性能评测
VoxCPM2 在公开的零样本和可控 TTS 基准测试中取得了 SOTA 或可比的结果。
### Seed-TTS-eval
<details>
<summary><b>Seed-TTS-eval WER(⬇)&SIM(⬆) 结果(点击展开)</b></summary>
| Model | Parameters | Open-Source | test-EN | | test-ZH | | test-Hard | |
|------|------|------|:------------:|:--:|:------------:|:--:|:-------------:|:--:|
| | | | WER/%⬇ | SIM/%⬆| CER/%⬇| SIM/%⬆ | CER/%⬇ | SIM/%⬆ |
| MegaTTS3 | 0.5B | ❌ | 2.79 | 77.1 | 1.52 | 79.0 | - | - |
| DiTAR | 0.6B | ❌ | 1.69 | 73.5 | 1.02 | 75.3 | - | - |
| CosyVoice3 | 0.5B | ❌ | 2.02 | 71.8 | 1.16 | 78.0 | 6.08 | 75.8 |
| CosyVoice3 | 1.5B | ❌ | 2.22 | 72.0 | 1.12 | 78.1 | 5.83 | 75.8 |
| Seed-TTS | - | ❌ | 2.25 | 76.2 | 1.12 | 79.6 | 7.59 | 77.6 |
| MiniMax-Speech | - | ❌ | 1.65 | 69.2 | 0.83 | 78.3 | - | - |
| F5-TTS | 0.3B | ✅ | 2.00 | 67.0 | 1.53 | 76.0 | 8.67 | 71.3 |
| MaskGCT | 1B | ✅ | 2.62 | 71.7 | 2.27 | 77.4 | - | - |
| CosyVoice | 0.3B | ✅ | 4.29 | 60.9 | 3.63 | 72.3 | 11.75 | 70.9 |
| CosyVoice2 | 0.5B | ✅ | 3.09 | 65.9 | 1.38 | 75.7 | 6.83 | 72.4 |
| SparkTTS | 0.5B | ✅ | 3.14 | 57.3 | 1.54 | 66.0 | - | - |
| FireRedTTS | 0.5B | ✅ | 3.82 | 46.0 | 1.51 | 63.5 | 17.45 | 62.1 |
| FireRedTTS-2 | 1.5B | ✅ | 1.95 | 66.5 | 1.14 | 73.6 | - | - |
| Qwen2.5-Omni | 7B | ✅ | 2.72 | 63.2 | 1.70 | 75.2 | 7.97 | 74.7 |
| Qwen3-Omni | 30B-A3B | ✅ | 1.39 | - | 1.07 | - | - | - |
| OpenAudio-s1-mini | 0.5B | ✅ | 1.94 | 55.0 | 1.18 | 68.5 | 23.37 | 64.3 |
| IndexTTS2 | 1.5B | ✅ | 2.23 | 70.6 | 1.03 | 76.5 | 7.12 | 75.5 |
| VibeVoice | 1.5B | ✅ | 3.04 | 68.9 | 1.16 | 74.4 | - | - |
| HiggsAudio-v2 | 3B | ✅ | 2.44 | 67.7 | 1.50 | 74.0 | 55.07 | 65.6 |
| VoxCPM-0.5B | 0.6B | ✅ | 1.85 | 72.9 | 0.93 | 77.2 | 8.87 | 73.0 |
| VoxCPM1.5 | 0.8B | ✅ | 2.12 | 71.4 | 1.18 | 77.0 | 7.74 | 73.1 |
| MOSS-TTS | | ✅ | 1.85 | 73.4 | 1.20 | 78.8 | - | - |
| Qwen3-TTS | 1.7B | ✅ | 1.23 | 71.7 | 1.22 | 77.0 | 6.76 | 74.8 |
| FishAudio S2 | 4B | ✅ | 0.99 | - | 0.54 | - | 5.99 | - |
| LongCat-Audio-DiT | 3.5B | ✅ | 1.50 | 78.6 | 1.09 | 81.8 | 6.04 | 79.7 |
| **VoxCPM2** | 2B | ✅ | 1.84 | 75.3 | 0.97| 79.5| 8.13 | 75.3 |
</details>
### CV3-eval
<details>
<summary><b>CV3-eval 多语言 WER/CER(⬇) 结果(点击展开)</b></summary>
| Model | zh | en | hard-zh | hard-en | ja | ko | de | es | fr | it | ru |
|-------|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| CosyVoice2 | 4.08 | 6.32 | 12.58| 11.96| 9.13 | 19.7 |- | - | - | - | - |
| CosyVoice3-1.5B | 3.91 | 4.99 | 9.77 | 10.55 | 7.57 | 5.69 | 6.43 | 4.47 | 11.8 | 10.5 | 6.64 |
| Fish Audio S2 | 2.65 | 2.43 | 9.10 | 4.40 | 3.96 | 2.76 | 2.22 | 2.00 | 6.26 | 2.04 | 2.78 |
| **VoxCPM2** | 3.65 | 5.00 | 8.55 | 8.48 | 5.96 | 5.69 | 4.77 | 3.80 | 9.85 | 4.25 | 5.21 |
</details>
### MiniMax-Multilingual-Test
<details>
<summary><b>Minimax-MLS-test WER(⬇) 结果(点击展开)</b></summary>
| Language | Minimax | ElevenLabs | Qwen3-TTS | FishAudio S2 | **VoxCPM2** |
|----------|:-------:|:----------:|:--------------------:|:------------:|:-----------:|
| Arabic | **1.665** | 1.666 | | 3.500 | 13.046 |
| Cantonese | 34.111 | 51.513 | | **30.670** | 38.584 |
| Chinese | 2.252 | 16.026 | 0.928 | **0.730** | 1.136 |
| Czech | 3.875 | **2.108** | | 2.840 | 24.132 |
| Dutch | 1.143 | **0.803** | | 0.990 | 0.913 |
| English | 2.164 | 2.339 | **0.934** | 1.620 | 2.289 |
| Finnish | 4.666 | 2.964 | | 3.330 | **2.632** |
| French | 4.099 | 5.216 | **2.858** | 3.050 | 4.534 |
| German | 1.906 | 0.572 | 1.235 | **0.550** | 0.679 |
| Greek | 2.016 | **0.991** | | 5.740 | 2.844 |
| Hindi | 6.962 | **5.827** | | 14.640 | 19.699 |
| Indonesian | 1.237 | **1.059** | | 1.460 | 1.084 |
| Italian | 1.543 | 1.743 | **0.948** | 1.270 | 1.563 |
| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.628 |
| Korean | 1.747 | 1.865 | 1.755 | **1.180** | 1.962 |
| Polish | 1.415 | **0.766** | | 1.260 | 1.141 |
| Portuguese | 1.877 | 1.331 | 1.526 | **1.140** | 1.938 |
| Romanian | 2.878 | **1.347** | | 10.740 | 21.577 |
| Russian | 4.281 | 3.878 | 3.212 | **2.400** | 3.634 |
| Spanish | 1.029 | 1.084 | 1.126 | **0.910** | 1.438 |
| Thai | 2.701 | 73.936 | | 4.230 | 2.961 |
| Turkish | 1.52 | 0.699 | | 0.870 | 0.817 |
| Ukrainian | 1.082 | **0.997** | | 2.300 | 6.316 |
| Vietnamese | **0.88** | 73.415 | | 7.410 | 3.307 |
</details>
<details>
<summary><b>Minimax-MLS-test SIM(⬆) 结果(点击展开)</b></summary>
| Language | Minimax | ElevenLabs | Qwen3-TTS | FishAudio S2 | **VoxCPM2** |
|----------|:-------:|:----------:|:--------------------:|:------------:|:-----------:|
| Arabic | 73.6 | 70.6 | | 75.0 | **79.1** |
| Cantonese | 77.8 | 67.0 | | 80.5 | **83.5** |
| Chinese | 78.0 | 67.7 | 79.9 | 81.6 | **82.5** |
| Czech | 79.6 | 68.5 | | **79.8** | 78.3 |
| Dutch | 73.8 | 68.0 | | 73.0 | **80.8** |
| English | 75.6 | 61.3 | 77.5 | 79.7 | **85.4** |
| Finnish | 83.5 | 75.9 | | 81.9 | **89.0** |
| French | 62.8 | 53.5 | 62.8 | 69.8 | **73.5** |
| German | 73.3 | 61.4 | 77.5 | 76.7 | **80.3** |
| Greek | 82.6 | 73.3 | | 79.5 | **86.0** |
| Hindi | 81.8 | 73.0 | | 82.1 | **85.6** |
| Indonesian | 72.9 | 66.0 | | 76.3 | **80.0** |
| Italian | 69.9 | 57.9 | 81.7 | 74.7 | **78.0** |
| Japanese | 77.6 | 73.8 | 78.8 | 79.6 | **82.8** |
| Korean | 77.6 | 70.0 | 79.9 | 81.7 | **83.3** |
| Polish | 80.2 | 72.9 | | 81.9 | **88.4** |
| Portuguese | 80.5 | 71.1 | 81.7 | 78.1 | **83.7** |
| Romanian | **80.9** | 69.9 | | 73.3 | 79.7 |
| Russian | 76.1 | 67.6 | 79.2 | 79.0 | **81.1** |
| Spanish | 76.2 | 61.5 | 81.4 | 77.6 | **83.1** |
| Thai | 80.0 | 58.8 | | 78.6 | **84.0** |
| Turkish | 77.9 | 59.6 | | 83.5 | **87.1** |
| Ukrainian | 73.0 | 64.7 | | 74.7 | **79.8** |
| Vietnamese | 74.3 | 36.9 | | 74.0 | **80.6** |
</details>
### Internal 30-Language ASR Benchmark
我们额外进行了内部多语言可懂度评测:**30 语种 × 500 样本**,ASR 转写评估使用 **Gemini 3.1 Flash Lite API**。
<details>
<summary><b>内部30语种评测集ASR结果(点击展开)</b></summary>
| 语言 | 指标 | VoxCPM2 | Fish S2-Pro |
|---|---:|---:|---:|
| ar (阿拉伯语) | CER | 1.23% | 0.30% |
| da (丹麦语) | WER | 2.70% | 3.52% |
| de (德语) | WER | 0.96% | 0.64% |
| el (希腊语) | WER | 3.17% | 4.61% |
| en (英语) | WER | 0.42% | 1.03% |
| es (西班牙语) | WER | 1.33% | 0.64% |
| fi (芬兰语) | WER | 2.24% | 2.80% |
| fr (法语) | WER | 2.16% | 2.34% |
| he (希伯来语) | CER | 2.98% | 15.27% |
| hi (印地语) | CER | 0.79% | 0.91% |
| id (印尼语) | WER | 1.36% | 1.68% |
| it (意大利语) | WER | 1.65% | 1.08% |
| ja (日语) | CER | 2.40% | 1.82% |
| km (高棉语) | CER | 2.05% | 75.15% |
| ko (韩语) | CER | 0.95% | 0.29% |
| lo (老挝语) | CER | 1.90% | 87.40% |
| ms (马来语) | WER | 1.75% | 1.41% |
| my (缅甸语) | CER | 1.42% | 85.27% |
| nl (荷兰语) | WER | 1.25% | 1.68% |
| no (挪威语) | WER | 2.49% | 3.76% |
| pl (波兰语) | WER | 1.90% | 1.65% |
| pt (葡萄牙语) | WER | 1.48% | 1.49% |
| ru (俄语) | WER | 0.90% | 0.86% |
| sv (瑞典语) | WER | 2.22% | 2.63% |
| sw (斯瓦希里语) | CER | 1.07% | 2.02% |
| th (泰语) | CER | 0.94% | 1.92% |
| tl (菲律宾语) | WER | 2.63% | 4.00% |
| tr (土耳其语) | WER | 1.65% | 1.65% |
| vi (越南语) | WER | 1.56% | 5.56% |
| zh (中文) | CER | 0.92% | 1.02% |
| 平均(30 语种) | | **1.68%** | - |
</details>
### InstructTTSEval
<details>
<summary><b>指令驱动音色设计结果 (点击展开)</b></summary>
| Model | InstructTTSEval-ZH | | | InstructTTSEval-EN | | |
|-------|:---:|:----:|:----:|:----:|:----:|:----:|
| | APS⬆| DSD⬆ | RP⬆| APS⬆ | DSD⬆ | RP⬆ |
| Hume | | | | 83.0 | 75.3 | 54.3 |
| VoxInstruct | 47.5 | 52.3 | 42.6 | 54.9 | 57.0 | 39.3 |
| Parler-tts-mini | | | | 63.4 | 48.7 | 28.6 |
| Parler-tts-large | | | | 60.0 | 45.9 | 31.2 |
| PromptTTS | | | | 64.3 | 47.2 | 31.4 |
| PromptStyle | | | | 57.4 | 46.4 | 30.9 |
| VoiceSculptor | 75.7 | 64.7 | 61.5 | | | |
| Mimo-Audio-7B-Instruct | 75.7 | 74.3 | 61.5 | 80.6 | 77.6 | 59.5 |
| Qwen3TTS-12Hz-1.7B-VD | **85.2** | **81.1** | **65.1** | 82.9 | 82.4 | 68.4 |
| **VoxCPM2** | **85.2** | 71.5 | 60.8 | **84.2** | **83.2** | **71.4** |
</details>
---
## ⚙️ 微调
VoxCPM 支持**全参数微调(SFT)** 和 **LoRA 微调**。仅需 **5-10分钟** 的音频数据,即可适配特定说话人、语言或领域。
```bash
# LoRA 微调(参数高效,推荐)
python scripts/train_voxcpm_finetune.py \
--config_path conf/voxcpm_v2/voxcpm_finetune_lora.yaml
# 全参数微调
python scripts/train_voxcpm_finetune.py \
--config_path conf/voxcpm_v2/voxcpm_finetune_all.yaml
# WebUI 训练与推理
python lora_ft_webui.py # 然后打开 http://localhost:7860
```
> **完整指南 →** [微调文档](https://voxcpm.readthedocs.io/zh-cn/latest/finetuning/finetune.html)(数据准备、配置、训练、LoRA 热切换、常见问题)
---
## 📚 文档
完整文档:**[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/zh-cn/latest/)**
| 主题 | 链接 |
|---|---|
| 快速开始与安装 | [快速开始](https://voxcpm.readthedocs.io/zh-cn/latest/quickstart.html) |
| 使用指南与 Cookbook | [使用指南](https://voxcpm.readthedocs.io/zh-cn/latest/usage_guide.html) |
| VoxCPM 系列模型 | [模型列表](https://voxcpm.readthedocs.io/zh-cn/latest/models/version_history.html) |
| 微调(SFT & LoRA) | [微调指南](https://voxcpm.readthedocs.io/zh-cn/latest/finetuning/finetune.html) |
| 常见问题 | [FAQ](https://voxcpm.readthedocs.io/zh-cn/latest/faq.html) |
---
## 🌟 生态与社区
| 项目 | 说明 |
|---|---|
| [**Nano-vLLM**](https://github.com/a710128/nanovllm-voxcpm) | 高吞吐快速 GPU 推理引擎 |
| [**VoxCPM.cpp**](https://github.com/bluryar/VoxCPM.cpp) | GGML/GGUF:CPU、CUDA、Vulkan 推理 |
| [**VoxCPM-ONNX**](https://github.com/bluryar/VoxCPM-ONNX) | ONNX 导出,支持 CPU 推理 |
| [**VoxCPMANE**](https://github.com/0seba/VoxCPMANE) | Apple Neural Engine 后端 |
| [**voxcpm_rs**](https://github.com/madushan1000/voxcpm_rs) | Rust 重新实现 |
| [**ComfyUI-VoxCPM**](https://github.com/wildminder/ComfyUI-VoxCPM) | ComfyUI 节点工作流 |
| [**ComfyUI-VoxCPMTTS**](https://github.com/1038lab/ComfyUI-VoxCPMTTS) | ComfyUI TTS 扩展 |
| [**TTS WebUI**](https://github.com/rsxdalv/tts_webui_extension.vox_cpm) | 浏览器端 TTS 扩展 |
> 完整生态见[文档](https://voxcpm.readthedocs.io/zh-cn/latest/)。社区项目非 OpenBMB 官方维护。做了什么有趣的东西?[提 Issue 或 PR](https://github.com/OpenBMB/VoxCPM/issues) 把它加进来!
---
## ⚠️ 风险与局限性
- **滥用风险:** VoxCPM 的声音克隆能力可生成高度逼真的合成语音。**严禁**将 VoxCPM 用于冒充他人、欺诈或虚假信息传播。我们强烈建议对所有 AI 生成的内容进行明确标注。
- **可控生成稳定性:** 音色设计和可控声音克隆的结果可能因生成次数而异 — 建议尝试生成 1~3 次以获得理想的音色或风格。我们正在积极提升可控性的一致性。
- **语言覆盖:** VoxCPM2 官方支持 30 种语言。对于未列入的语言,欢迎直接测试或使用自有数据进行微调。我们计划在未来版本中扩展语言覆盖。
- **使用说明:** 本模型基于 Apache-2.0 协议发布。用于生产部署时,我们建议针对具体场景进行充分的测试和安全评估。
---
## 📖 引用
如果 VoxCPM 对您有帮助,请考虑引用我们的工作并为仓库加星 ⭐!
```bib
@article{voxcpm2_2026,
title = {VoxCPM2: Tokenizer-Free TTS for Multilingual Speech Generation, Creative Voice Design, and True-to-Life Cloning},
author = {VoxCPM Team},
journal = {GitHub},
year = {2026},
}
@article{voxcpm2025,
title = {VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation
and True-to-Life Voice Cloning},
author = {Zhou, Yixuan and Zeng, Guoyang and Liu, Xin and Li, Xiang and
Yu, Renjie and Wang, Ziyang and Ye, Runchuan and Sun, Weiyue and
Gui, Jiancheng and Li, Kehan and Wu, Zhiyong and Liu, Zhiyuan},
journal = {arXiv preprint arXiv:2509.24650},
year = {2025},
}
```
## 📄 许可证
VoxCPM 模型权重和代码基于 [Apache-2.0](LICENSE) 协议开源。
## 🙏 致谢
- [DiTAR](https://arxiv.org/abs/2502.03930) 扩散自回归骨干架构
- [MiniCPM-4](https://github.com/OpenBMB/MiniCPM) 语言模型基座
- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 基于 Flow Matching 的 LocDiT 实现
- [DAC](https://github.com/descriptinc/descript-audio-codec) Audio VAE 骨干
- 感谢所有社区用户试用 VoxCPM、反馈问题、分享想法和贡献——你们的支持让项目持续进步
## 机构
<p>
<a href="https://modelbest.cn/"><img src="assets/modelbest_logo.png" width="28px"> 面壁智能</a>
&nbsp;&nbsp;&nbsp;
<a href="https://github.com/thuhcsi"><img src="assets/thuhcsi_logo.png" width="28px"> 清华大学人机交互实验室</a>
</p>
## ⭐ Star 历史
[![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date)
BIN
View File
Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.5 KiB

+3 -1
View File
@@ -1,7 +1,8 @@
pretrained_path: /path/to/VoxCPM2/
train_manifest: /path/to/train.jsonl
val_manifest: null
sample_rate: 48000
sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate
out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging
batch_size: 2
grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16
num_workers: 8
@@ -14,6 +15,7 @@ weight_decay: 0.01
warmup_steps: 100
max_steps: 1000
max_batch_tokens: 8192
max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled
save_path: /path/to/checkpoints/finetune_all
tensorboard: /path/to/logs/finetune_all
lambdas:
+3 -1
View File
@@ -1,7 +1,8 @@
pretrained_path: /path/to/VoxCPM2/
train_manifest: /path/to/train.jsonl
val_manifest: null
sample_rate: 48000
sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate
out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging
batch_size: 2
grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16
num_workers: 8
@@ -14,6 +15,7 @@ weight_decay: 0.01
warmup_steps: 100
max_steps: 1000
max_batch_tokens: 8192
max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled
save_path: /path/to/checkpoints/finetune_lora
tensorboard: /path/to/logs/finetune_lora
lambdas:
+74 -8
View File
@@ -14,8 +14,10 @@ from typing import Optional
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root / "src"))
# Default pretrained model path relative to this repo
default_pretrained_path = str(project_root / "models" / "openbmb__VoxCPM1.5")
# Default pretrained model path: prefer VoxCPM2 if it exists, fallback to VoxCPM1.5
_v2_path = project_root / "models" / "openbmb__VoxCPM2"
_v15_path = project_root / "models" / "openbmb__VoxCPM1.5"
default_pretrained_path = str(_v2_path if _v2_path.exists() else _v15_path)
from voxcpm.core import VoxCPM
from voxcpm.model.voxcpm import LoRAConfig
@@ -99,6 +101,24 @@ def get_timestamp_str():
return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
def detect_sample_rate(pretrained_path: str) -> Optional[int]:
    """Read ``audio_vae_config.sample_rate`` from the model's ``config.json``.

    This is the AudioVAE *encoder* input rate, which is the correct rate for
    resampling training data.

    Args:
        pretrained_path: Directory expected to contain ``config.json``.

    Returns:
        The sample rate as an ``int``, or ``None`` when detection fails:
        the path is ``None``, ``config.json`` is missing or unreadable, the
        JSON is malformed, or the ``audio_vae_config.sample_rate`` entry is
        absent or not convertible to ``int``.
    """
    # A None path would make os.path.join raise TypeError; treat it as
    # "nothing to detect" so UI callbacks never crash on it.
    if pretrained_path is None:
        return None

    config_file = os.path.join(pretrained_path, "config.json")
    if not os.path.isfile(config_file):
        return None

    try:
        with open(config_file, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        return int(cfg["audio_vae_config"]["sample_rate"])
    except (OSError, KeyError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # TypeError covers a null / non-dict "audio_vae_config" and a null
        # "sample_rate" (int(None)). OSError covers an unreadable file.
        print(f"Warning: failed to detect sample_rate from {config_file}: {e}", file=sys.stderr)
        return None
def get_or_load_asr_model():
global asr_model
if asr_model is None:
@@ -350,6 +370,7 @@ def start_training(
warmup_steps=100,
max_steps=None,
sample_rate=44100,
max_grad_norm=1.0,
# LoRA advanced
enable_lm=True,
enable_dit=True,
@@ -377,15 +398,39 @@ def start_training(
os.makedirs(checkpoints_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
# Auto-detect sample_rate from model config.json to prevent mismatch
detected_sr = detect_sample_rate(pretrained_path)
if detected_sr is not None:
if int(sample_rate) != detected_sr:
training_log += (
f"[Auto-fix] sample_rate changed from {int(sample_rate)} to {detected_sr} "
f"(read from {pretrained_path}/config.json audio_vae_config.sample_rate)\n"
)
sample_rate = detected_sr
# Create config dictionary
# Resolve max_steps default
resolved_max_steps = int(max_steps) if max_steps not in (None, "", 0) else int(num_iters)
# Auto-detect out_sample_rate from model config
out_sample_rate = 0
config_file = os.path.join(pretrained_path, "config.json")
if os.path.isfile(config_file):
try:
with open(config_file, "r", encoding="utf-8") as f:
cfg = json.load(f)
out_sr = cfg.get("audio_vae_config", {}).get("out_sample_rate")
if out_sr:
out_sample_rate = int(out_sr)
except Exception:
pass
config = {
"pretrained_path": pretrained_path,
"train_manifest": train_manifest,
"val_manifest": val_manifest,
"sample_rate": int(sample_rate),
"out_sample_rate": out_sample_rate,
"batch_size": int(batch_size),
"grad_accum_steps": int(grad_accum_steps),
"num_workers": int(num_workers),
@@ -397,6 +442,7 @@ def start_training(
"weight_decay": float(weight_decay),
"warmup_steps": int(warmup_steps),
"max_steps": resolved_max_steps,
"max_grad_norm": float(max_grad_norm),
"save_path": checkpoints_dir,
"tensorboard": tensorboard_path if tensorboard_path else logs_dir,
"lambdas": {"loss/diff": 1.0, "loss/stop": 1.0},
@@ -904,17 +950,19 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
with gr.Row():
max_steps = gr.Number(label="最大步数 (max_steps, 0→默认num_iters)", value=0, precision=0)
sample_rate = gr.Number(label="采样率 (sample_rate)", value=44100, precision=0)
tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="")
max_grad_norm = gr.Number(label="梯度裁剪 (max_grad_norm, 0=关闭)", value=1.0)
with gr.Row():
tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="")
enable_lm = gr.Checkbox(label="启用 LoRA LM (enable_lm)", value=True)
enable_dit = gr.Checkbox(label="启用 LoRA DIT (enable_dit)", value=True)
with gr.Row():
enable_proj = gr.Checkbox(label="启用投影 (enable_proj)", value=False)
dropout = gr.Number(label="LoRA Dropout", value=0.0)
gr.Markdown("#### 分发选项 (Distribution)")
with gr.Row():
hf_model_id = gr.Textbox(
label="HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)", value="openbmb/VoxCPM1.5"
label="HuggingFace Model ID (e.g., openbmb/VoxCPM2)", value=""
)
distribute = gr.Checkbox(label="分发模式 (distribute)", value=False)
@@ -929,6 +977,19 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
show_label=False,
)
def on_pretrained_path_change(path):
    """Refresh the sample-rate field when the pretrained model path changes.

    Returns a Gradio update carrying the detected sample rate, or an empty
    update (no value requested) when ``detect_sample_rate`` returns ``None``.
    """
    detected = detect_sample_rate(path)
    if detected is None:
        return gr.update()
    return gr.update(value=detected)
train_pretrained_path.change(
on_pretrained_path_change,
inputs=[train_pretrained_path],
outputs=[sample_rate],
)
start_btn.click(
start_training,
inputs=[
@@ -951,6 +1012,7 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
warmup_steps,
max_steps,
sample_rate,
max_grad_norm,
enable_lm,
enable_dit,
enable_proj,
@@ -1109,12 +1171,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
"warmup_steps": "warmup_steps",
"max_steps": "最大步数 (max_steps)",
"sample_rate": "采样率 (sample_rate)",
"max_grad_norm": "梯度裁剪 (max_grad_norm, 0=关闭)",
"enable_lm": "启用 LoRA LM (enable_lm)",
"enable_dit": "启用 LoRA DIT (enable_dit)",
"enable_proj": "启用投影 (enable_proj)",
"dropout": "LoRA Dropout",
"tensorboard_path": "Tensorboard 路径 (可选)",
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)",
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)",
"distribute": "分发模式 (distribute)",
}
else:
@@ -1127,12 +1190,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
"warmup_steps": "Warmup Steps",
"max_steps": "Max Steps",
"sample_rate": "Sample Rate",
"max_grad_norm": "Max Grad Norm (0=disabled)",
"enable_lm": "Enable LoRA LM",
"enable_dit": "Enable LoRA DIT",
"enable_proj": "Enable Projection",
"dropout": "LoRA Dropout",
"tensorboard_path": "Tensorboard Path (Optional)",
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)",
"hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)",
"distribute": "Distribute Mode",
}
@@ -1162,11 +1226,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
gr.update(label=adv["warmup_steps"]),
gr.update(label=adv["max_steps"]),
gr.update(label=adv["sample_rate"]),
gr.update(label=adv["max_grad_norm"]),
gr.update(label=adv["tensorboard_path"]),
gr.update(label=adv["enable_lm"]),
gr.update(label=adv["enable_dit"]),
gr.update(label=adv["enable_proj"]),
gr.update(label=adv["dropout"]),
gr.update(label=adv["tensorboard_path"]),
# Distribution options
gr.update(label=adv["hf_model_id"]),
gr.update(label=adv["distribute"]),
@@ -1213,11 +1278,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css
warmup_steps,
max_steps,
sample_rate,
max_grad_norm,
tensorboard_path,
enable_lm,
enable_dit,
enable_proj,
dropout,
tensorboard_path,
# distribution outputs
hf_model_id,
distribute,
+31 -9
View File
@@ -30,7 +30,8 @@ except ImportError:
import json
from voxcpm.model import VoxCPMModel, VoxCPM2Model
from voxcpm.model.voxcpm import LoRAConfig
from voxcpm.model.voxcpm import LoRAConfig as LoRAConfigV1
from voxcpm.model.voxcpm2 import LoRAConfig as LoRAConfigV2
from voxcpm.training import (
Accelerator,
BatchProcessor,
@@ -46,6 +47,7 @@ def train(
train_manifest: str,
val_manifest: str = "",
sample_rate: int = 16_000,
out_sample_rate: int = 0, # AudioVAE decoder output rate; used for TensorBoard audio logging
batch_size: int = 1,
grad_accum_steps: int = 1,
num_workers: int = 2,
@@ -63,6 +65,7 @@ def train(
lambdas: Dict[str, float] = {"loss/diff": 1.0, "loss/stop": 1.0},
lora: dict = None,
config_path: str = "",
max_grad_norm: float = 0.0, # gradient clipping; 0 = disabled (backward compat)
# Distribution options (for LoRA checkpoints)
hf_model_id: str = "", # HuggingFace model ID (e.g., "openbmb/VoxCPM1.5")
distribute: bool = False, # If True, save hf_model_id as base_model; otherwise save pretrained_path
@@ -91,6 +94,7 @@ def train(
with open(os.path.join(pretrained_path, "config.json"), "r", encoding="utf-8") as _f:
_arch = json.load(_f).get("architecture", "voxcpm").lower()
_model_cls = VoxCPM2Model if _arch == "voxcpm2" else VoxCPMModel
LoRAConfig = LoRAConfigV2 if _arch == "voxcpm2" else LoRAConfigV1
if accelerator.rank == 0:
print(f"Detected architecture: {_arch} -> {_model_cls.__name__}", file=sys.stderr)
base_model = _model_cls.from_local(
@@ -98,6 +102,12 @@ def train(
)
tokenizer = base_model.text_tokenizer
expected_sr = base_model.audio_vae.sample_rate
assert sample_rate == expected_sr, (
f"sample_rate mismatch: config says {sample_rate}, but the AudioVAE encoder expects {expected_sr}. "
f"Please set sample_rate: {expected_sr} in your training config. "
)
train_ds, val_ds = load_audio_text_datasets(
train_manifest=train_manifest,
val_manifest=val_manifest,
@@ -170,8 +180,12 @@ def train(
dataset_cnt=dataset_cnt,
device=accelerator.device,
)
# Save audio_vae for audio generation
# Save audio_vae and output sample rate for audio generation.
# Prefer model's actual output rate; fall back to YAML out_sample_rate or encode rate.
audio_vae_for_gen = base_model.audio_vae
out_sr = base_model.sample_rate # decoder output rate (e.g. 48000 for V2)
if out_sr == 0 and out_sample_rate > 0:
out_sr = out_sample_rate
del base_model.audio_vae
model = accelerator.prepare_model(base_model)
unwrapped_model = accelerator.unwrap(model)
@@ -304,8 +318,8 @@ def train(
scaler = getattr(accelerator, "scaler", None)
if scaler is not None:
scaler.unscale_(optimizer)
# Use large max_norm to only compute grad_norm without actual clipping
grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=1e9)
effective_max_norm = max_grad_norm if max_grad_norm > 0 else 1e9
grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=effective_max_norm)
accelerator.step(optimizer)
accelerator.update()
@@ -333,6 +347,7 @@ def train(
val_ds=val_ds,
audio_vae=audio_vae_for_gen,
sample_rate=sample_rate,
out_sample_rate=out_sr,
val_texts=val_texts,
tokenizer=tokenizer,
valid_interval=valid_interval,
@@ -359,6 +374,7 @@ def validate(
val_ds=None,
audio_vae=None,
sample_rate=22050,
out_sample_rate=0,
val_texts=None,
tokenizer=None,
valid_interval=1000,
@@ -424,6 +440,7 @@ def validate(
step,
accelerator,
sample_rate,
out_sample_rate=out_sample_rate,
val_texts=val_texts,
tokenizer=tokenizer,
valid_interval=valid_interval,
@@ -526,6 +543,7 @@ def generate_sample_audio(
step,
accelerator,
sample_rate=22050,
out_sample_rate=0,
val_texts=None,
tokenizer=None,
pretrained_path=None,
@@ -540,6 +558,10 @@ def generate_sample_audio(
log(f"[Audio] Starting audio generation for {num_samples} samples at step {step}")
unwrapped_model = accelerator.unwrap(model)
# Determine the correct output sample rate for generated audio.
# out_sample_rate is the decoder output rate (e.g. 48kHz for V2);
# sample_rate is the encoder input rate (e.g. 16kHz for V2).
gen_sr = out_sample_rate if out_sample_rate > 0 else sample_rate
for i in range(num_samples):
sample = val_ds[i]
@@ -596,10 +618,10 @@ def generate_sample_audio(
gen_audio_np = normalize_audio(gen_audio_np)
tag = f"val_sample_{i}"
writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=sample_rate)
log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/sample_rate:.2f}s")
writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=gen_sr)
log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/gen_sr:.2f}s")
# Log reference audio
# Log reference audio (at encoder input rate, which is what val_ds provides)
if ref_audio_np is not None:
writer.add_audio(
f"{tag}/reference_audio", normalize_audio(ref_audio_np), global_step=step, sample_rate=sample_rate
@@ -607,9 +629,9 @@ def generate_sample_audio(
# Generate mel spectrogram figure
try:
mel_gen = compute_mel_spectrogram(gen_audio_np, sample_rate)
mel_gen = compute_mel_spectrogram(gen_audio_np, gen_sr)
mel_ref = compute_mel_spectrogram(ref_audio_np, sample_rate) if ref_audio_np is not None else None
fig = create_mel_figure(gen_audio_np, mel_gen, sample_rate, step, ref_audio_np, mel_ref)
fig = create_mel_figure(gen_audio_np, mel_gen, gen_sr, step, ref_audio_np, mel_ref)
writer.add_figure(f"{tag}/mel_spectrogram", fig, global_step=step)
log(f"[Audio] Created mel spectrogram figure for sample {i}")
except Exception as e:
Binary file not shown.
Binary file not shown.
Binary file not shown.
+66 -54
View File
@@ -48,25 +48,8 @@ from ..modules.minicpm4 import MiniCPM4Config, MiniCPMModel
from .utils import get_dtype, mask_multichar_chinese_tokens
def _trim_audio_silence_vad(
audio: torch.Tensor,
sample_rate: int,
max_silence_ms: float = 200.0,
top_db: float = 35.0,
) -> torch.Tensor:
"""使用能量阈值(VAD 方式)截取首尾静音及尾部长段伪静音,首尾各最多保留 max_silence_ms 毫秒静音。
会同时截掉末尾的长段伪静音(低能量但非完全静音的段落,如长时间底噪)。
Args:
audio: (1, T) 的音频 tensor
sample_rate: 采样率
max_silence_ms: 首尾允许保留的最大静音长度(毫秒)
top_db: 低于参考电平多少 dB 视为静音
Returns:
截取后的 (1, T') tensor
"""
# A simple helper that trims audio silence using VAD; it is not used by default.
def _trim_audio_silence_vad(audio: torch.Tensor, sample_rate: int, max_silence_ms: float = 200.0, top_db: float = 35.0) -> torch.Tensor:
if audio.numel() == 0:
return audio
y = audio.squeeze(0).numpy()
@@ -85,7 +68,7 @@ def _trim_audio_silence_vad(
except Exception:
start, end = 0, n
# 用逐帧 RMS 找「最后一段有持续能量的位置」,截掉末尾长伪静音(低能量底噪等)
# Find the last frame with continuous energy, trim the long pseudo-silence at the end (low energy background noise, etc.)
n_frames = max(0, (n - frame_length) // hop_length + 1)
last_voice_frame = -1
for j in range(n_frames):
@@ -246,6 +229,7 @@ class VoxCPM2Model(nn.Module):
# Audio VAE
self.audio_vae = audio_vae
self.chunk_size = audio_vae.chunk_size
self._decode_chunk_size = getattr(audio_vae, "decode_chunk_size", audio_vae.chunk_size)
self._encode_sample_rate = audio_vae.sample_rate
self.sample_rate = getattr(audio_vae, "out_sample_rate", audio_vae.sample_rate)
@@ -382,11 +366,7 @@ class VoxCPM2Model(nn.Module):
mu=dit_hidden,
patch_size=self.patch_size,
cond=feat_cond_for_sample,
n_timesteps=(
self.config.dit_config.cfm_config.inference_cfg_rate
if hasattr(self.config.dit_config.cfm_config, "inference_cfg_rate")
else 10
),
n_timesteps=10,
)
feat_pred = rearrange(feat_pred_seq.transpose(1, 2), "(b t) d p -> b d (t p)", b=B, p=self.patch_size)
@@ -402,19 +382,26 @@ class VoxCPM2Model(nn.Module):
def _dtype(self):
    """Resolve the dtype named by ``self.config.dtype`` through ``get_dtype``."""
    configured = self.config.dtype
    return get_dtype(configured)
def _encode_wav(self, wav_path: str, padding_mode: str = "right") -> torch.Tensor:
def _encode_wav(
self,
wav_path: str,
padding_mode: str = "right",
trim_silence_vad: bool = False,
) -> torch.Tensor:
"""Load, trim, pad and VAE-encode an audio file.
Args:
wav_path: path to the audio file.
padding_mode: "right" (default) or "left" padding for alignment.
trim_silence_vad: whether to apply VAD-based silence trimming.
Returns:
audio_feat: (T, P, D) tensor of latent patches.
"""
audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True)
audio = torch.from_numpy(audio).unsqueeze(0)
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
if trim_silence_vad:
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
patch_len = self.patch_size * self.chunk_size
if audio.size(1) % patch_len != 0:
padding_size = patch_len - audio.size(1) % patch_len
@@ -475,6 +462,7 @@ class VoxCPM2Model(nn.Module):
retry_badcase: bool = False,
retry_badcase_max_times: int = 3,
retry_badcase_ratio_threshold: float = 6.0,
trim_silence_vad: bool = False,
streaming: bool = False,
streaming_prefix_len: int = 4,
) -> Generator[torch.Tensor, None, None]:
@@ -495,8 +483,12 @@ class VoxCPM2Model(nn.Module):
)
text_length = text_token.shape[0]
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
ref_feat = self._encode_wav(
reference_wav_path,
padding_mode="right",
trim_silence_vad=trim_silence_vad,
)
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
prompt_audio_length = prompt_feat.size(0)
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
@@ -538,7 +530,11 @@ class VoxCPM2Model(nn.Module):
)
text_length = text_token.shape[0]
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
ref_feat = self._encode_wav(
reference_wav_path,
padding_mode="right",
trim_silence_vad=trim_silence_vad,
)
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
text_pad_feat = torch.zeros(
@@ -595,7 +591,7 @@ class VoxCPM2Model(nn.Module):
)
text_length = text_token.shape[0]
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
prompt_audio_length = prompt_feat.size(0)
prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
text_pad_feat = torch.zeros(
@@ -640,14 +636,14 @@ class VoxCPM2Model(nn.Module):
streaming_prefix_len=streaming_prefix_len,
)
if streaming:
patch_len = self.patch_size * self.chunk_size
for latent_pred, _ in inference_result:
decode_patch_len = self.patch_size * self._decode_chunk_size
for latent_pred, _, _ctx in inference_result:
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
decode_audio = decode_audio[..., -decode_patch_len:].squeeze(1).cpu()
yield decode_audio
break
else:
latent_pred, pred_audio_feat = next(inference_result)
latent_pred, pred_audio_feat, context_len = next(inference_result)
if retry_badcase:
if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
print(
@@ -663,10 +659,9 @@ class VoxCPM2Model(nn.Module):
if not streaming:
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
patch_len = self.patch_size * self.chunk_size
has_continuation = bool(prompt_wav_path)
if has_continuation:
decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1):].squeeze(1).cpu()
decode_patch_len = self.patch_size * self._decode_chunk_size
if context_len > 0:
decode_audio = decode_audio[..., decode_patch_len * context_len:].squeeze(1).cpu()
else:
decode_audio = decode_audio.squeeze(1).cpu()
yield decode_audio
@@ -677,6 +672,7 @@ class VoxCPM2Model(nn.Module):
prompt_text: str = None,
prompt_wav_path: str = None,
reference_wav_path: str = None,
trim_silence_vad: bool = False,
):
"""
Build prompt cache for subsequent generation.
@@ -693,6 +689,8 @@ class VoxCPM2Model(nn.Module):
Must be paired with ``prompt_text``.
reference_wav_path: reference audio path for voice cloning
(structurally isolated via ref_audio tokens).
trim_silence_vad: whether to apply VAD-based silence trimming
before encoding prompt/reference audio.
Returns:
prompt_cache: dict used by ``_generate_with_prompt_cache``.
@@ -705,11 +703,19 @@ class VoxCPM2Model(nn.Module):
cache = {}
if reference_wav_path:
cache["ref_audio_feat"] = self._encode_wav(reference_wav_path, padding_mode="right")
cache["ref_audio_feat"] = self._encode_wav(
reference_wav_path,
padding_mode="right",
trim_silence_vad=trim_silence_vad,
)
if prompt_wav_path and prompt_text is not None:
cache["prompt_text"] = prompt_text
cache["audio_feat"] = self._encode_wav(prompt_wav_path, padding_mode="left")
cache["audio_feat"] = self._encode_wav(
prompt_wav_path,
padding_mode="left",
trim_silence_vad=trim_silence_vad,
)
has_ref = "ref_audio_feat" in cache
has_prompt = "audio_feat" in cache
@@ -917,14 +923,14 @@ class VoxCPM2Model(nn.Module):
streaming_prefix_len=streaming_prefix_len,
)
if streaming:
patch_len = self.patch_size * self.chunk_size
for latent_pred, pred_audio_feat in inference_result:
decode_patch_len = self.patch_size * self._decode_chunk_size
for latent_pred, pred_audio_feat, _ctx in inference_result:
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
decode_audio = decode_audio[..., -decode_patch_len:].squeeze(1).cpu()
yield (decode_audio, target_text_token, pred_audio_feat)
break
else:
latent_pred, pred_audio_feat = next(inference_result)
latent_pred, pred_audio_feat, context_len = next(inference_result)
if retry_badcase:
if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
print(
@@ -939,18 +945,20 @@ class VoxCPM2Model(nn.Module):
break
if not streaming:
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
patch_len = self.patch_size * self.chunk_size
if mode in ("continuation", "ref_continuation"):
decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu()
decode_patch_len = self.patch_size * self._decode_chunk_size
if context_len > 0:
decode_audio = decode_audio[..., decode_patch_len * context_len:].squeeze(1).cpu()
else:
decode_audio = decode_audio[..., :].squeeze(1).cpu()
decode_audio = decode_audio.squeeze(1).cpu()
yield (decode_audio, target_text_token, pred_audio_feat)
def inference(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run non-streaming inference and return ``(feat_pred, generated_feat)``.

    All positional and keyword arguments are forwarded to ``_inference`` with
    ``streaming=False``. ``_inference`` yields a 3-tuple whose last element is
    the context length; it is dropped here so callers keep receiving the
    two-element result promised by the return annotation.
    """
    # Only the first (and, in non-streaming mode, only) item is consumed.
    feat_pred, generated_feat, _ = next(self._inference(*args, streaming=False, **kwargs))
    return feat_pred, generated_feat
def inference_streaming(self, *args, **kwargs) -> Generator[Tuple[torch.Tensor, List[torch.Tensor]], None, None]:
    """Stream inference results as ``(feat_pred, pred_feat_seq)`` pairs.

    All positional and keyword arguments are forwarded to ``_inference`` with
    ``streaming=True``. Each item yielded by ``_inference`` is a 3-tuple; its
    trailing context-length element is stripped so consumers keep seeing
    two-element items, matching the return annotation.
    """
    # This function is a generator: a leading ``return <value>`` would end it
    # before anything is yielded, so the loop must be the only exit path.
    for feat_pred, pred_feat_seq, _ in self._inference(*args, streaming=True, **kwargs):
        yield feat_pred, pred_feat_seq
@torch.inference_mode()
def _inference(
@@ -1009,6 +1017,7 @@ class VoxCPM2Model(nn.Module):
# trailing audio patches as initial context so the VAE can decode smoothly.
# - Reference-only / zero-shot (feat_mask ends with 0): start from scratch.
has_continuation_audio = feat_mask[0, -1].item() == 1
context_len = 0
if has_continuation_audio:
audio_indices = feat_mask.squeeze(0).nonzero(as_tuple=True)[0]
context_len = min(streaming_prefix_len - 1, len(audio_indices))
@@ -1058,11 +1067,13 @@ class VoxCPM2Model(nn.Module):
prefix_feat_cond = pred_feat
if streaming:
# return the last three predicted latent features to provide enough context for smooth decoding
pred_feat_chunk = torch.cat(pred_feat_seq[-streaming_prefix_len:], dim=1)
feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)
yield feat_pred, pred_feat_seq
yield feat_pred, pred_feat_seq, context_len
if len(pred_feat_seq) > streaming_prefix_len:
pred_feat_seq = pred_feat_seq[-streaming_prefix_len:]
stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
if i > min_len and stop_flag == 1:
@@ -1081,7 +1092,8 @@ class VoxCPM2Model(nn.Module):
if not streaming:
pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d
feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
yield feat_pred, pred_feat_seq.squeeze(0).cpu()
generated_feat = pred_feat_seq[:, context_len:, :, :].squeeze(0).cpu()
yield feat_pred, generated_feat, context_len
@classmethod
def from_local(cls, path: str, optimize: bool = True, training: bool = False, lora_config: LoRAConfig = None):
@@ -436,6 +436,7 @@ class AudioVAE(nn.Module):
self.out_sample_rate = out_sample_rate
self.sr_bin_boundaries = sr_bin_boundaries
self.chunk_size = math.prod(encoder_rates)
self.decode_chunk_size = math.prod(decoder_rates)
def preprocess(self, audio_data, sample_rate):
if sample_rate is None:
+1 -1
View File
@@ -225,7 +225,7 @@ class UnifiedCFM(torch.nn.Module):
losses = F.mse_loss(u_pred, u_tgt.detach(), reduction="none").mean(dim=1)
if tgt_mask is not None:
weights = self.adaptive_loss_weighting(losses, tgt_mask.squeeze(1))
loss = (weights * losses).sum() / torch.sum(tgt_mask)
loss = (weights * losses).sum() / torch.clamp(torch.sum(tgt_mask), min=1.0)
else:
loss = losses.mean()