From df38f0a16762ff4445ced76dc1515244aea10e21 Mon Sep 17 00:00:00 2001 From: Labmem-Zhouyx <913703649@qq.com> Date: Wed, 8 Apr 2026 11:29:19 +0800 Subject: [PATCH] update readme for modelscope download; make VAD silence trimming opt-in (trim_silence_vad, default False) --- README.md | 24 ++++++++++++++++++++- README_zh.md | 24 ++++++++++++++++++++- src/voxcpm/model/voxcpm2.py | 43 ++++++++++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0495559..ffd0c8a 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ from voxcpm import VoxCPM import soundfile as sf model = VoxCPM.from_pretrained( - "openbmb/VoxCPM2" + "openbmb/VoxCPM2", load_denoiser=False, ) @@ -116,6 +116,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate) print("saved: demo.wav") ``` +If you prefer downloading from ModelScope first, you can use: + +```bash +pip install modelscope +``` + +```python +from modelscope.hub.snapshot_download import snapshot_download +from voxcpm import VoxCPM +import soundfile as sf + +local_model_dir = snapshot_download("OpenBMB/VoxCPM2") +model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False) + +wav = model.generate( + text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.", + cfg_value=2.0, + inference_timesteps=10, +) +sf.write("demo.wav", wav, model.tts_model.sample_rate) +``` + #### 🎨 Voice Design Create a voice from a natural-language description — no reference audio needed. **Format:** put the description in parentheses at the start of `text`(e.g. 
`"(your voice description)The text to synthesize."`): diff --git a/README_zh.md b/README_zh.md index f89abab..6cc3b6b 100644 --- a/README_zh.md +++ b/README_zh.md @@ -102,7 +102,7 @@ from voxcpm import VoxCPM import soundfile as sf model = VoxCPM.from_pretrained( - "openbmb/VoxCPM2" + "openbmb/VoxCPM2", load_denoiser=False, ) @@ -115,6 +115,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate) print("已保存: demo.wav") ``` +如果你希望先从 ModelScope 下载模型到本地(适用于国内网络访问),可以使用: + +```bash +pip install modelscope +``` + +```python +from modelscope.hub.snapshot_download import snapshot_download +from voxcpm import VoxCPM +import soundfile as sf + +local_model_dir = snapshot_download("OpenBMB/VoxCPM2") +model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False) + +wav = model.generate( + text="VoxCPM2 是目前推荐使用的多语言语音合成版本。", + cfg_value=2.0, + inference_timesteps=10, +) +sf.write("demo.wav", wav, model.tts_model.sample_rate) +``` + #### 🎨 音色设计 用自然语言描述创建全新音色,无需参考音频。**格式:** 在 `text` 开头用括号写入音色描述(如 `"(音色描述)要合成的文本。"`): diff --git a/src/voxcpm/model/voxcpm2.py b/src/voxcpm/model/voxcpm2.py index d2323bf..45d6fc1 100644 --- a/src/voxcpm/model/voxcpm2.py +++ b/src/voxcpm/model/voxcpm2.py @@ -402,19 +402,26 @@ class VoxCPM2Model(nn.Module): def _dtype(self): return get_dtype(self.config.dtype) - def _encode_wav(self, wav_path: str, padding_mode: str = "right") -> torch.Tensor: + def _encode_wav( + self, + wav_path: str, + padding_mode: str = "right", + trim_silence_vad: bool = False, + ) -> torch.Tensor: """Load, trim, pad and VAE-encode an audio file. Args: wav_path: path to the audio file. padding_mode: "right" (default) or "left" padding for alignment. + trim_silence_vad: whether to apply VAD-based silence trimming. Returns: audio_feat: (T, P, D) tensor of latent patches. 
""" audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True) audio = torch.from_numpy(audio).unsqueeze(0) - audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0) + if trim_silence_vad: + audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0) patch_len = self.patch_size * self.chunk_size if audio.size(1) % patch_len != 0: padding_size = patch_len - audio.size(1) % patch_len @@ -475,6 +482,7 @@ class VoxCPM2Model(nn.Module): retry_badcase: bool = False, retry_badcase_max_times: int = 3, retry_badcase_ratio_threshold: float = 6.0, + trim_silence_vad: bool = False, streaming: bool = False, streaming_prefix_len: int = 4, ) -> Generator[torch.Tensor, None, None]: @@ -495,8 +503,12 @@ class VoxCPM2Model(nn.Module): ) text_length = text_token.shape[0] - ref_feat = self._encode_wav(reference_wav_path, padding_mode="right") - prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left") + ref_feat = self._encode_wav( + reference_wav_path, + padding_mode="right", + trim_silence_vad=trim_silence_vad, + ) + prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad) prompt_audio_length = prompt_feat.size(0) ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device) @@ -538,7 +550,11 @@ class VoxCPM2Model(nn.Module): ) text_length = text_token.shape[0] - ref_feat = self._encode_wav(reference_wav_path, padding_mode="right") + ref_feat = self._encode_wav( + reference_wav_path, + padding_mode="right", + trim_silence_vad=trim_silence_vad, + ) ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device) text_pad_feat = torch.zeros( @@ -595,7 +611,7 @@ class VoxCPM2Model(nn.Module): ) text_length = text_token.shape[0] - prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left") + prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", 
trim_silence_vad=trim_silence_vad) prompt_audio_length = prompt_feat.size(0) prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device) text_pad_feat = torch.zeros( @@ -677,6 +693,7 @@ class VoxCPM2Model(nn.Module): prompt_text: str = None, prompt_wav_path: str = None, reference_wav_path: str = None, + trim_silence_vad: bool = False, ): """ Build prompt cache for subsequent generation. @@ -693,6 +710,8 @@ class VoxCPM2Model(nn.Module): Must be paired with ``prompt_text``. reference_wav_path: reference audio path for voice cloning (structurally isolated via ref_audio tokens). + trim_silence_vad: whether to apply VAD-based silence trimming + before encoding prompt/reference audio. Returns: prompt_cache: dict used by ``_generate_with_prompt_cache``. @@ -705,11 +724,19 @@ class VoxCPM2Model(nn.Module): cache = {} if reference_wav_path: - cache["ref_audio_feat"] = self._encode_wav(reference_wav_path, padding_mode="right") + cache["ref_audio_feat"] = self._encode_wav( + reference_wav_path, + padding_mode="right", + trim_silence_vad=trim_silence_vad, + ) if prompt_wav_path and prompt_text is not None: cache["prompt_text"] = prompt_text - cache["audio_feat"] = self._encode_wav(prompt_wav_path, padding_mode="left") + cache["audio_feat"] = self._encode_wav( + prompt_wav_path, + padding_mode="left", + trim_silence_vad=trim_silence_vad, + ) has_ref = "ref_audio_feat" in cache has_prompt = "audio_feat" in cache