From 746631c38d5a204e596f5aac0edbb2f0d5a1e8c3 Mon Sep 17 00:00:00 2001 From: Labmem-Zhouyx <913703649@qq.com> Date: Mon, 6 Apr 2026 16:10:50 +0800 Subject: [PATCH] update --- README.md | 74 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index b91e402..800f405 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
@@ -43,7 +43,7 @@ Chinese Dialect: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山 ### News -* **[2026.04]** 🔥 We release **VoxCPM2** — 2B, 30 languages, Voice Design & Controllable Voice Cloning, 48kHz audio output! [Weights](https://huggingface.co/openbmb/VoxCPM2) | [Docs](https://voxcpm.readthedocs.io/en/dev_2.0/) +* **[2026.04]** 🔥 We release **VoxCPM2** — 2B, 30 languages, Voice Design & Controllable Voice Cloning, 48kHz audio output! [Weights](https://huggingface.co/openbmb/VoxCPM2) | [Docs](https://voxcpm.readthedocs.io/en/latest/) * **[2025.12]** 🎉 Open-source **VoxCPM1.5** [weights](https://huggingface.co/openbmb/VoxCPM1.5) with SFT & LoRA fine-tuning. (**🏆 #1 GitHub Trending**) * **[2025.09]** 🔥 Release VoxCPM [Technical Report](https://arxiv.org/abs/2509.24650). * **[2025.09]** 🎉 Open-source **VoxCPM-0.5B** [weights](https://huggingface.co/openbmb/VoxCPM-0.5B) & [Playground](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo). (**🏆 #1 HuggingFace Trending**) @@ -73,10 +73,10 @@ Chinese Dialect: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山 ### Installation ```sh -pip install voxcpm +uv pip install voxcpm ``` -> **Requirements:** Python ≥ 3.10, PyTorch ≥ 2.5.0, CUDA ≥ 12.0. See [Quick Start Docs](https://voxcpm.readthedocs.io/en/dev_2.0/quickstart.html) for details. +> **Requirements:** Python ≥ 3.10, PyTorch ≥ 2.5.0, CUDA ≥ 12.0. See [Quick Start Docs](https://voxcpm.readthedocs.io/en/latest/quickstart.html) for details. 
### Python API @@ -86,14 +86,18 @@ pip install voxcpm from voxcpm import VoxCPM import soundfile as sf -model = VoxCPM.from_pretrained("openbmb/VoxCPM2") +model = VoxCPM.from_pretrained( + "openbmb/VoxCPM2", + load_denoiser=False, +) wav = model.generate( - text="VoxCPM2 brings multilingual support, creative voice design, and controllable voice cloning.", + text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.", cfg_value=2.0, inference_timesteps=10, ) -sf.write("output.wav", wav, 48000) +sf.write("demo.wav", wav, model.tts_model.sample_rate) +print("saved: demo.wav") ``` #### 🎨 Voice Design @@ -102,34 +106,36 @@ Create a voice from a natural-language description — no reference audio needed ```python wav = model.generate( - text="(A warm, gentle female voice in her 30s with a calm and soothing tone)" - "Welcome to VoxCPM2, the next generation of realistic speech synthesis.", + text="(A young woman, gentle and sweet voice)Hello, welcome to VoxCPM2!", + cfg_value=2.0, + inference_timesteps=10, ) -sf.write("voice_design.wav", wav, 48000) +sf.write("voice_design.wav", wav, model.tts_model.sample_rate) ``` -#### 🎛️ Controllable Cloning +#### 🎛️ Controllable Voice Cloning -Clone any voice from a short reference clip, with optional style guidance: +Upload a reference audio. The model clones the timbre, and you can still use control instructions to adjust speed, emotion, or style. 
```python wav = model.generate( - text="This is a voice cloning demonstration using VoxCPM2.", - reference_wav_path="speaker_reference.wav", + text="This is a cloned voice generated by VoxCPM2.", + reference_wav_path="speaker.wav", ) -sf.write("cloned.wav", wav, 48000) +sf.write("clone.wav", wav, model.tts_model.sample_rate) wav = model.generate( - text="(Speaking slowly with a whispering, mysterious tone)" - "The secret lies hidden in the ancient library, waiting to be discovered.", - reference_wav_path="speaker_reference.wav", + text="(slightly faster, cheerful tone)This is a cloned voice with style control.", + reference_wav_path="speaker.wav", + cfg_value=2.0, + inference_timesteps=10, ) -sf.write("style_control.wav", wav, 48000) +sf.write("controllable_clone.wav", wav, model.tts_model.sample_rate) ``` #### 🎙️ Ultimate Cloning -Provide both the reference audio and its transcript for audio-continuation-based cloning with every vocal nuance reproduced. For maximum cloning similarity, pass the same reference clip to both `reference_wav_path` and `prompt_wav_path` as shown below: +Provide both the reference audio and its exact transcript for audio-continuation-based cloning with every vocal nuance reproduced. For maximum cloning similarity, pass the same reference clip to both `reference_wav_path` and `prompt_wav_path` as shown below: ```python wav = model.generate( @@ -138,7 +144,7 @@ wav = model.generate( prompt_text="The transcript of the reference audio.", reference_wav_path="speaker_reference.wav", ) -sf.write("ultimate_cloned.wav", wav, 48000) +sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate) ```
-> For full architectural details, VoxCPM2-specific upgrades, and a model comparison table, see the [Architecture & Design Docs](https://voxcpm.readthedocs.io/en/dev_2.0/models/version_history.html).
+> For full architectural details, VoxCPM2-specific upgrades, and a model comparison table, see the [Architecture & Design Docs](https://voxcpm.readthedocs.io/en/latest/models/version_history.html).
---
@@ -324,7 +330,7 @@ VoxCPM2 achieves state-of-the-art or comparable results on public zero-shot TTS
| Hindi | 6.962 | **5.827** | – | 14.640 | 19.699 |
| Indonesian | 1.237 | **1.059** | – | 1.460 | 1.084 |
| Italian | 1.543 | 1.743 | **0.948** | 1.270 | 1.563 |
-| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.083 |
+| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.628 |
| Korean | 1.747 | 1.865 | 1.755 | **1.180** | 1.962 |
| Polish | 1.415 | **0.766** | – | 1.260 | 1.141 |
| Portuguese | 1.877 | 1.331 | 1.526 | **1.140** | 1.938 |
@@ -416,21 +422,21 @@ python scripts/train_voxcpm_finetune.py \
python lora_ft_webui.py # then open http://localhost:7860
```
-> **Full guide →** [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/dev_2.0/finetuning/finetune.html) (data preparation, configuration, training, LoRA hot-swapping, FAQ)
+> **Full guide →** [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/latest/finetuning/finetune.html) (data preparation, configuration, training, LoRA hot-swapping, FAQ)
---
## 📚 Documentation
-Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/dev_2.0/)**
+Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/latest/)**
| Topic | Link |
|---|---|
-| Quick Start & Installation | [Quick Start](https://voxcpm.readthedocs.io/en/dev_2.0/quickstart.html) |
-| Usage Guide & Cookbook | [User Guide](https://voxcpm.readthedocs.io/en/dev_2.0/usage_guide.html) |
-| VoxCPM Series | [Models](https://voxcpm.readthedocs.io/en/dev_2.0/models/version_history.html) |
-| Fine-tuning (SFT & LoRA) | [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/dev_2.0/finetuning/finetune.html) |
-| FAQ & Troubleshooting | [FAQ](https://voxcpm.readthedocs.io/en/dev_2.0/faq.html) |
+| Quick Start & Installation | [Quick Start](https://voxcpm.readthedocs.io/en/latest/quickstart.html) |
+| Usage Guide & Cookbook | [User Guide](https://voxcpm.readthedocs.io/en/latest/usage_guide.html) |
+| VoxCPM Series | [Models](https://voxcpm.readthedocs.io/en/latest/models/version_history.html) |
+| Fine-tuning (SFT & LoRA) | [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/latest/finetuning/finetune.html) |
+| FAQ & Troubleshooting | [FAQ](https://voxcpm.readthedocs.io/en/latest/faq.html) |
---
@@ -447,7 +453,7 @@ Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/d
| [**ComfyUI-VoxCPMTTS**](https://github.com/1038lab/ComfyUI-VoxCPMTTS) | ComfyUI TTS extension |
| [**TTS WebUI**](https://github.com/rsxdalv/tts_webui_extension.vox_cpm) | Browser-based TTS extension |
-> See the full [Ecosystem](https://voxcpm.readthedocs.io/en/dev_2.0/) in the docs. Community projects are not officially maintained by OpenBMB. Built something cool? [Open an issue or PR](https://github.com/OpenBMB/VoxCPM/issues) to add it!
+> See the full [Ecosystem](https://voxcpm.readthedocs.io/en/latest/) in the docs. Community projects are not officially maintained by OpenBMB. Built something cool? [Open an issue or PR](https://github.com/OpenBMB/VoxCPM/issues) to add it!
---