Update README for ModelScope download
This commit is contained in:
@@ -103,7 +103,7 @@ from voxcpm import VoxCPM
|
|||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
model = VoxCPM.from_pretrained(
|
model = VoxCPM.from_pretrained(
|
||||||
"openbmb/VoxCPM2"
|
"openbmb/VoxCPM2",
|
||||||
load_denoiser=False,
|
load_denoiser=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -116,6 +116,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
|||||||
print("saved: demo.wav")
|
print("saved: demo.wav")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you prefer to download the model from ModelScope to a local directory first, you can use:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install modelscope
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
from modelscope.hub.snapshot_download import snapshot_download
|
||||||
|
from voxcpm import VoxCPM
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
|
||||||
|
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
|
||||||
|
|
||||||
|
wav = model.generate(
|
||||||
|
text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.",
|
||||||
|
cfg_value=2.0,
|
||||||
|
inference_timesteps=10,
|
||||||
|
)
|
||||||
|
sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||||
|
```
|
||||||
|
|
||||||
#### 🎨 Voice Design
|
#### 🎨 Voice Design
|
||||||
|
|
||||||
Create a voice from a natural-language description — no reference audio needed. **Format:** put the description in parentheses at the start of `text` (e.g. `"(your voice description)The text to synthesize."`):
|
Create a voice from a natural-language description — no reference audio needed. **Format:** put the description in parentheses at the start of `text` (e.g. `"(your voice description)The text to synthesize."`):
|
||||||
|
|||||||
+23
-1
@@ -102,7 +102,7 @@ from voxcpm import VoxCPM
|
|||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
model = VoxCPM.from_pretrained(
|
model = VoxCPM.from_pretrained(
|
||||||
"openbmb/VoxCPM2"
|
"openbmb/VoxCPM2",
|
||||||
load_denoiser=False,
|
load_denoiser=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -115,6 +115,28 @@ sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
|||||||
print("已保存: demo.wav")
|
print("已保存: demo.wav")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
如果你希望先从 ModelScope 下载模型到本地(适用于国内网络访问),可以使用:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install modelscope
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
from modelscope.hub.snapshot_download import snapshot_download
|
||||||
|
from voxcpm import VoxCPM
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
local_model_dir = snapshot_download("OpenBMB/VoxCPM2")
|
||||||
|
model = VoxCPM.from_pretrained(local_model_dir, load_denoiser=False)
|
||||||
|
|
||||||
|
wav = model.generate(
|
||||||
|
text="VoxCPM2 是目前推荐使用的多语言语音合成版本。",
|
||||||
|
cfg_value=2.0,
|
||||||
|
inference_timesteps=10,
|
||||||
|
)
|
||||||
|
sf.write("demo.wav", wav, model.tts_model.sample_rate)
|
||||||
|
```
|
||||||
|
|
||||||
#### 🎨 音色设计
|
#### 🎨 音色设计
|
||||||
|
|
||||||
用自然语言描述创建全新音色,无需参考音频。**格式:** 在 `text` 开头用括号写入音色描述(如 `"(音色描述)要合成的文本。"`):
|
用自然语言描述创建全新音色,无需参考音频。**格式:** 在 `text` 开头用括号写入音色描述(如 `"(音色描述)要合成的文本。"`):
|
||||||
|
|||||||
@@ -402,18 +402,25 @@ class VoxCPM2Model(nn.Module):
|
|||||||
def _dtype(self):
|
def _dtype(self):
|
||||||
return get_dtype(self.config.dtype)
|
return get_dtype(self.config.dtype)
|
||||||
|
|
||||||
def _encode_wav(self, wav_path: str, padding_mode: str = "right") -> torch.Tensor:
|
def _encode_wav(
|
||||||
|
self,
|
||||||
|
wav_path: str,
|
||||||
|
padding_mode: str = "right",
|
||||||
|
trim_silence_vad: bool = False,
|
||||||
|
) -> torch.Tensor:
|
||||||
"""Load, trim, pad and VAE-encode an audio file.
|
"""Load, trim, pad and VAE-encode an audio file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
wav_path: path to the audio file.
|
wav_path: path to the audio file.
|
||||||
padding_mode: "right" (default) or "left" padding for alignment.
|
padding_mode: "right" (default) or "left" padding for alignment.
|
||||||
|
trim_silence_vad: whether to apply VAD-based silence trimming.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
audio_feat: (T, P, D) tensor of latent patches.
|
audio_feat: (T, P, D) tensor of latent patches.
|
||||||
"""
|
"""
|
||||||
audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True)
|
audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True)
|
||||||
audio = torch.from_numpy(audio).unsqueeze(0)
|
audio = torch.from_numpy(audio).unsqueeze(0)
|
||||||
|
if trim_silence_vad:
|
||||||
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
|
audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
|
||||||
patch_len = self.patch_size * self.chunk_size
|
patch_len = self.patch_size * self.chunk_size
|
||||||
if audio.size(1) % patch_len != 0:
|
if audio.size(1) % patch_len != 0:
|
||||||
@@ -475,6 +482,7 @@ class VoxCPM2Model(nn.Module):
|
|||||||
retry_badcase: bool = False,
|
retry_badcase: bool = False,
|
||||||
retry_badcase_max_times: int = 3,
|
retry_badcase_max_times: int = 3,
|
||||||
retry_badcase_ratio_threshold: float = 6.0,
|
retry_badcase_ratio_threshold: float = 6.0,
|
||||||
|
trim_silence_vad: bool = False,
|
||||||
streaming: bool = False,
|
streaming: bool = False,
|
||||||
streaming_prefix_len: int = 4,
|
streaming_prefix_len: int = 4,
|
||||||
) -> Generator[torch.Tensor, None, None]:
|
) -> Generator[torch.Tensor, None, None]:
|
||||||
@@ -495,8 +503,12 @@ class VoxCPM2Model(nn.Module):
|
|||||||
)
|
)
|
||||||
text_length = text_token.shape[0]
|
text_length = text_token.shape[0]
|
||||||
|
|
||||||
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
|
ref_feat = self._encode_wav(
|
||||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
|
reference_wav_path,
|
||||||
|
padding_mode="right",
|
||||||
|
trim_silence_vad=trim_silence_vad,
|
||||||
|
)
|
||||||
|
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
|
||||||
prompt_audio_length = prompt_feat.size(0)
|
prompt_audio_length = prompt_feat.size(0)
|
||||||
|
|
||||||
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
||||||
@@ -538,7 +550,11 @@ class VoxCPM2Model(nn.Module):
|
|||||||
)
|
)
|
||||||
text_length = text_token.shape[0]
|
text_length = text_token.shape[0]
|
||||||
|
|
||||||
ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
|
ref_feat = self._encode_wav(
|
||||||
|
reference_wav_path,
|
||||||
|
padding_mode="right",
|
||||||
|
trim_silence_vad=trim_silence_vad,
|
||||||
|
)
|
||||||
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
|
||||||
|
|
||||||
text_pad_feat = torch.zeros(
|
text_pad_feat = torch.zeros(
|
||||||
@@ -595,7 +611,7 @@ class VoxCPM2Model(nn.Module):
|
|||||||
)
|
)
|
||||||
text_length = text_token.shape[0]
|
text_length = text_token.shape[0]
|
||||||
|
|
||||||
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
|
prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left", trim_silence_vad=trim_silence_vad)
|
||||||
prompt_audio_length = prompt_feat.size(0)
|
prompt_audio_length = prompt_feat.size(0)
|
||||||
prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
|
prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
|
||||||
text_pad_feat = torch.zeros(
|
text_pad_feat = torch.zeros(
|
||||||
@@ -677,6 +693,7 @@ class VoxCPM2Model(nn.Module):
|
|||||||
prompt_text: str = None,
|
prompt_text: str = None,
|
||||||
prompt_wav_path: str = None,
|
prompt_wav_path: str = None,
|
||||||
reference_wav_path: str = None,
|
reference_wav_path: str = None,
|
||||||
|
trim_silence_vad: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Build prompt cache for subsequent generation.
|
Build prompt cache for subsequent generation.
|
||||||
@@ -693,6 +710,8 @@ class VoxCPM2Model(nn.Module):
|
|||||||
Must be paired with ``prompt_text``.
|
Must be paired with ``prompt_text``.
|
||||||
reference_wav_path: reference audio path for voice cloning
|
reference_wav_path: reference audio path for voice cloning
|
||||||
(structurally isolated via ref_audio tokens).
|
(structurally isolated via ref_audio tokens).
|
||||||
|
trim_silence_vad: whether to apply VAD-based silence trimming
|
||||||
|
before encoding prompt/reference audio.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
prompt_cache: dict used by ``_generate_with_prompt_cache``.
|
prompt_cache: dict used by ``_generate_with_prompt_cache``.
|
||||||
@@ -705,11 +724,19 @@ class VoxCPM2Model(nn.Module):
|
|||||||
cache = {}
|
cache = {}
|
||||||
|
|
||||||
if reference_wav_path:
|
if reference_wav_path:
|
||||||
cache["ref_audio_feat"] = self._encode_wav(reference_wav_path, padding_mode="right")
|
cache["ref_audio_feat"] = self._encode_wav(
|
||||||
|
reference_wav_path,
|
||||||
|
padding_mode="right",
|
||||||
|
trim_silence_vad=trim_silence_vad,
|
||||||
|
)
|
||||||
|
|
||||||
if prompt_wav_path and prompt_text is not None:
|
if prompt_wav_path and prompt_text is not None:
|
||||||
cache["prompt_text"] = prompt_text
|
cache["prompt_text"] = prompt_text
|
||||||
cache["audio_feat"] = self._encode_wav(prompt_wav_path, padding_mode="left")
|
cache["audio_feat"] = self._encode_wav(
|
||||||
|
prompt_wav_path,
|
||||||
|
padding_mode="left",
|
||||||
|
trim_silence_vad=trim_silence_vad,
|
||||||
|
)
|
||||||
|
|
||||||
has_ref = "ref_audio_feat" in cache
|
has_ref = "ref_audio_feat" in cache
|
||||||
has_prompt = "audio_feat" in cache
|
has_prompt = "audio_feat" in cache
|
||||||
|
|||||||
Reference in New Issue
Block a user