From 746631c38d5a204e596f5aac0edbb2f0d5a1e8c3 Mon Sep 17 00:00:00 2001 From: Labmem-Zhouyx <913703649@qq.com> Date: Mon, 6 Apr 2026 16:10:50 +0800 Subject: [PATCH] update --- README.md | 74 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index b91e402..800f405 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

Project Page Live Playground - Documentation + Documentation Hugging Face ModelScope

@@ -43,7 +43,7 @@ Chinese Dialect: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山 ### News -* **[2026.04]** 🔥 We release **VoxCPM2** — 2B, 30 languages, Voice Design & Controllable Voice Cloning, 48kHz audio output! [Weights](https://huggingface.co/openbmb/VoxCPM2) | [Docs](https://voxcpm.readthedocs.io/en/dev_2.0/) +* **[2026.04]** 🔥 We release **VoxCPM2** — 2B, 30 languages, Voice Design & Controllable Voice Cloning, 48kHz audio output! [Weights](https://huggingface.co/openbmb/VoxCPM2) | [Docs](https://voxcpm.readthedocs.io/en/latest/) * **[2025.12]** 🎉 Open-source **VoxCPM1.5** [weights](https://huggingface.co/openbmb/VoxCPM1.5) with SFT & LoRA fine-tuning. (**🏆 #1 GitHub Trending**) * **[2025.09]** 🔥 Release VoxCPM [Technical Report](https://arxiv.org/abs/2509.24650). * **[2025.09]** 🎉 Open-source **VoxCPM-0.5B** [weights](https://huggingface.co/openbmb/VoxCPM-0.5B) & [Playground](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo). (**🏆 #1 HuggingFace Trending**) @@ -73,10 +73,10 @@ Chinese Dialect: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山 ### Installation ```sh -pip install voxcpm +uv pip install voxcpm ``` -> **Requirements:** Python ≥ 3.10, PyTorch ≥ 2.5.0, CUDA ≥ 12.0. See [Quick Start Docs](https://voxcpm.readthedocs.io/en/dev_2.0/quickstart.html) for details. +> **Requirements:** Python ≥ 3.10, PyTorch ≥ 2.5.0, CUDA ≥ 12.0. See [Quick Start Docs](https://voxcpm.readthedocs.io/en/latest/quickstart.html) for details. 
### Python API @@ -86,14 +86,18 @@ pip install voxcpm from voxcpm import VoxCPM import soundfile as sf -model = VoxCPM.from_pretrained("openbmb/VoxCPM2") +model = VoxCPM.from_pretrained( + "openbmb/VoxCPM2", + load_denoiser=False, +) wav = model.generate( - text="VoxCPM2 brings multilingual support, creative voice design, and controllable voice cloning.", + text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.", cfg_value=2.0, inference_timesteps=10, ) -sf.write("output.wav", wav, 48000) +sf.write("demo.wav", wav, model.tts_model.sample_rate) +print("saved: demo.wav") ``` #### 🎨 Voice Design @@ -102,34 +106,36 @@ Create a voice from a natural-language description — no reference audio needed ```python wav = model.generate( - text="(A warm, gentle female voice in her 30s with a calm and soothing tone)" - "Welcome to VoxCPM2, the next generation of realistic speech synthesis.", + text="(A young woman, gentle and sweet voice)Hello, welcome to VoxCPM2!", + cfg_value=2.0, + inference_timesteps=10, ) -sf.write("voice_design.wav", wav, 48000) +sf.write("voice_design.wav", wav, model.tts_model.sample_rate) ``` -#### 🎛️ Controllable Cloning +#### 🎛️ Controllable Voice Cloning -Clone any voice from a short reference clip, with optional style guidance: +Upload a reference audio. The model clones the timbre, and you can still use control instructions to adjust speed, emotion, or style. 
```python wav = model.generate( - text="This is a voice cloning demonstration using VoxCPM2.", - reference_wav_path="speaker_reference.wav", + text="This is a cloned voice generated by VoxCPM2.", + reference_wav_path="speaker.wav", ) -sf.write("cloned.wav", wav, 48000) +sf.write("clone.wav", wav, model.tts_model.sample_rate) wav = model.generate( - text="(Speaking slowly with a whispering, mysterious tone)" - "The secret lies hidden in the ancient library, waiting to be discovered.", - reference_wav_path="speaker_reference.wav", + text="(slightly faster, cheerful tone)This is a cloned voice with style control.", + reference_wav_path="speaker.wav", + cfg_value=2.0, + inference_timesteps=10, ) -sf.write("style_control.wav", wav, 48000) +sf.write("controllable_clone.wav", wav, model.tts_model.sample_rate) ``` #### 🎙️ Ultimate Cloning -Provide both the reference audio and its transcript for audio-continuation-based cloning with every vocal nuance reproduced. For maximum cloning similarity, pass the same reference clip to both `reference_wav_path` and `prompt_wav_path` as shown below: +Provide both the reference audio and its exact transcript for audio-continuation-based cloning with every vocal nuance reproduced. For maximum cloning similarity, pass the same reference clip to both `reference_wav_path` and `prompt_wav_path` as shown below: ```python wav = model.generate( @@ -138,7 +144,7 @@ wav = model.generate( prompt_text="The transcript of the reference audio.", reference_wav_path="speaker_reference.wav", ) -sf.write("ultimate_cloned.wav", wav, 48000) +sf.write("hifi_clone.wav", wav, model.tts_model.sample_rate) ```
@@ -153,7 +159,7 @@ for chunk in model.generate_streaming( ): chunks.append(chunk) wav = np.concatenate(chunks) -sf.write("streaming.wav", wav, 48000) +sf.write("streaming.wav", wav, model.tts_model.sample_rate) ```
@@ -215,9 +221,9 @@ sf.write("out.wav", np.concatenate(chunks), 48000) server.stop() ``` -> **RTF as low as ~0.13 on NVIDIA RTX 4090** (vs ~0.15 with the standard PyTorch implementation), with support for batched concurrent requests and a FastAPI HTTP server. See the [Nano-vLLM-VoxCPM repo](https://github.com/a710128/nanovllm-voxcpm) for deployment details. +> **RTF as low as ~0.13 on NVIDIA RTX 4090** (vs ~0.3 with the standard PyTorch implementation), with support for batched concurrent requests and a FastAPI HTTP server. See the [Nano-vLLM-VoxCPM repo](https://github.com/a710128/nanovllm-voxcpm) for deployment details. -> **Full parameter reference, multi-scenario examples, and voice cloning tips →** [Quick Start Guide](https://voxcpm.readthedocs.io/en/dev_2.0/quickstart.html) | [Usage Guide & Best Practices](https://voxcpm.readthedocs.io/en/dev_2.0/chefsguide.html) +> **Full parameter reference, multi-scenario examples, and voice cloning tips →** [Quick Start Guide](https://voxcpm.readthedocs.io/en/latest/quickstart.html) | [Usage Guide & Best Practices](https://voxcpm.readthedocs.io/en/latest/cookbook.html) --- @@ -247,7 +253,7 @@ VoxCPM2 is built on a **tokenizer-free, diffusion autoregressive** paradigm. The VoxCPM2 Model Architecture -> For full architectural details, VoxCPM2-specific upgrades, and a model comparison table, see the [Architecture & Design Docs](https://voxcpm.readthedocs.io/en/dev_2.0/models/version_history.html). +> For full architectural details, VoxCPM2-specific upgrades, and a model comparison table, see the [Architecture & Design Docs](https://voxcpm.readthedocs.io/en/latest/models/version_history.html). 
--- @@ -324,7 +330,7 @@ VoxCPM2 achieves state-of-the-art or comparable results on public zero-shot TTS | Hindi | 6.962 | **5.827** | – | 14.640 | 19.699 | | Indonesian | 1.237 | **1.059** | – | 1.460 | 1.084 | | Italian | 1.543 | 1.743 | **0.948** | 1.270 | 1.563 | -| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.083 | +| Japanese | 3.519 | 10.646 | 3.823 | **2.760** | 4.628 | | Korean | 1.747 | 1.865 | 1.755 | **1.180** | 1.962 | | Polish | 1.415 | **0.766** | – | 1.260 | 1.141 | | Portuguese | 1.877 | 1.331 | 1.526 | **1.140** | 1.938 | @@ -416,21 +422,21 @@ python scripts/train_voxcpm_finetune.py \ python lora_ft_webui.py # then open http://localhost:7860 ``` -> **Full guide →** [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/dev_2.0/finetuning/finetune.html) (data preparation, configuration, training, LoRA hot-swapping, FAQ) +> **Full guide →** [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/latest/finetuning/finetune.html) (data preparation, configuration, training, LoRA hot-swapping, FAQ) --- ## 📚 Documentation -Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/dev_2.0/)** +Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/latest/)** | Topic | Link | |---|---| -| Quick Start & Installation | [Quick Start](https://voxcpm.readthedocs.io/en/dev_2.0/quickstart.html) | -| Usage Guide & Cookbook | [User Guide](https://voxcpm.readthedocs.io/en/dev_2.0/usage_guide.html) | -| VoxCPM Series | [Models](https://voxcpm.readthedocs.io/en/dev_2.0/models/version_history.html) | -| Fine-tuning (SFT & LoRA) | [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/dev_2.0/finetuning/finetune.html) | -| FAQ & Troubleshooting | [FAQ](https://voxcpm.readthedocs.io/en/dev_2.0/faq.html) | +| Quick Start & Installation | [Quick Start](https://voxcpm.readthedocs.io/en/latest/quickstart.html) | +| Usage Guide & Cookbook | [User Guide](https://voxcpm.readthedocs.io/en/latest/usage_guide.html) | +| VoxCPM Series | 
[Models](https://voxcpm.readthedocs.io/en/latest/models/version_history.html) | +| Fine-tuning (SFT & LoRA) | [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/latest/finetuning/finetune.html) | +| FAQ & Troubleshooting | [FAQ](https://voxcpm.readthedocs.io/en/latest/faq.html) | --- @@ -447,7 +453,7 @@ Full documentation: **[voxcpm.readthedocs.io](https://voxcpm.readthedocs.io/en/d | [**ComfyUI-VoxCPMTTS**](https://github.com/1038lab/ComfyUI-VoxCPMTTS) | ComfyUI TTS extension | | [**TTS WebUI**](https://github.com/rsxdalv/tts_webui_extension.vox_cpm) | Browser-based TTS extension | -> See the full [Ecosystem](https://voxcpm.readthedocs.io/en/dev_2.0/) in the docs. Community projects are not officially maintained by OpenBMB. Built something cool? [Open an issue or PR](https://github.com/OpenBMB/VoxCPM/issues) to add it! +> See the full [Ecosystem](https://voxcpm.readthedocs.io/en/latest/) in the docs. Community projects are not officially maintained by OpenBMB. Built something cool? [Open an issue or PR](https://github.com/OpenBMB/VoxCPM/issues) to add it! ---