diff --git a/.gitignore b/.gitignore index d397292..49a1772 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ voxcpm.egg-info .DS_Store ./pretrained_models/ -app_local.py \ No newline at end of file +app_local.py +models/ \ No newline at end of file diff --git a/app.py b/app.py index dba6fe3..1d8594f 100644 --- a/app.py +++ b/app.py @@ -54,7 +54,7 @@ _EXAMPLES_FOOTER_EN = ( ) _USAGE_INSTRUCTIONS_ZH = ( - "**VoxCPM2 — 三种语音生成方式:**\n\n" + "**三种语音生成方式:**\n\n" "🎨 **声音设计(Voice Design)** \n" "无需参考音频。在 **Control Instruction** 中描述目标音色特征" "(性别、年龄、语气、情绪、语速等),VoxCPM2 即可为你从零创造独一无二的声音。\n\n" @@ -65,6 +65,8 @@ _USAGE_INSTRUCTIONS_ZH = ( "开启 **极致克隆模式** 并提供参考音频的文字内容(可自动识别)。" "模型会将参考音频视为已说出的前文,以**音频续写**的方式完整还原参考音频中的所有声音细节。" "注意:该模式与可控克隆模式互斥,将禁用Control Instruction。\n\n" + "目前支持的方言包括:\n" + "「四川话、粤语、吴语、东北话、河南话、陕西话、山东话、天津话、闽南话」" ) _EXAMPLES_FOOTER_ZH = ( @@ -222,9 +224,9 @@ _APP_THEME = gr.themes.Soft( class VoxCPMDemo: def __init__(self, model_id: str = "openbmb/VoxCPM2") -> None: self.device = "cuda" if torch.cuda.is_available() else "cpu" - logger.info(f"Running on device: {self.device}") + logger.info(f"运行在设备上: {self.device}") - self.asr_model_id = "iic/SenseVoiceSmall" + self.asr_model_id = "./models/iic/SenseVoiceSmall" self.asr_model: Optional[AutoModel] = AutoModel( model=self.asr_model_id, disable_update=True, @@ -486,7 +488,7 @@ def run_demo( server_name: str = "0.0.0.0", server_port: int = 8808, show_error: bool = True, - model_id: str = "openbmb/VoxCPM2", + model_id: str = "./models/openbmb/VoxCPM2", ): demo = VoxCPMDemo(model_id=model_id) interface = create_demo_interface(demo) @@ -504,9 +506,9 @@ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument( - "--model-id", type=str, default="openbmb/VoxCPM2", - help="Local path or HuggingFace repo ID (default: openbmb/VoxCPM2)", + "--model-id", type=str, default="./models/openbmb/VoxCPM2", + help="本地路径或HuggingFace仓库ID(默认:./models/openbmb/VoxCPM2)", ) - parser.add_argument("--port", type=int, default=8808, help="Server port") + parser.add_argument("--port", type=int, default=8808, help="服务端口") args = parser.parse_args() - run_demo(model_id=args.model_id, server_port=args.port) + run_demo(model_id=args.model_id, server_port=args.port) \ No newline at end of file diff --git a/app_old.py b/app_old.py deleted file mode 100644 index d46c2e1..0000000 --- a/app_old.py +++ /dev/null @@ -1,280 +0,0 @@ -import os -import sys -import numpy as np -import torch -import gradio as gr -from typing import Optional, Tuple -from funasr import AutoModel -from pathlib import Path -os.environ["TOKENIZERS_PARALLELISM"] = "false" -if os.environ.get("HF_REPO_ID", "").strip() == "": - os.environ["HF_REPO_ID"] = "openbmb/VoxCPM1.5" - -import voxcpm - - -class VoxCPMDemo: - def __init__(self) -> None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" - print(f"🚀 Running on device: {self.device}", file=sys.stderr) - - # ASR model for prompt text recognition - self.asr_model_id = "iic/SenseVoiceSmall" - self.asr_model: Optional[AutoModel] = AutoModel( - model=self.asr_model_id, - disable_update=True, - log_level='DEBUG', - device="cuda:0" if self.device == "cuda" else "cpu", - ) - - # TTS model (lazy init) - self.voxcpm_model: Optional[voxcpm.VoxCPM] = None - self.default_local_model_dir = "./models/VoxCPM1.5" - - # ---------- Model helpers ---------- - def _resolve_model_dir(self) -> str: - """ - Resolve model directory: - 1) Use local checkpoint directory if exists - 2) If HF_REPO_ID env is set, download into models/{repo} - 3) Fallback to 'models' - """ - if os.path.isdir(self.default_local_model_dir): - return self.default_local_model_dir - - repo_id = os.environ.get("HF_REPO_ID", "").strip() - if len(repo_id) > 0: - target_dir = os.path.join("models", repo_id.replace("/", "__")) - if not os.path.isdir(target_dir): - try: - from huggingface_hub import snapshot_download # type: ignore - os.makedirs(target_dir, exist_ok=True) - print(f"Downloading model from HF repo '{repo_id}' to '{target_dir}' ...", file=sys.stderr) - snapshot_download(repo_id=repo_id, local_dir=target_dir, local_dir_use_symlinks=False) - except Exception as e: - print(f"Warning: HF download failed: {e}. Falling back to 'data'.", file=sys.stderr) - return "models" - return target_dir - return "models" - - def get_or_load_voxcpm(self) -> voxcpm.VoxCPM: - if self.voxcpm_model is not None: - return self.voxcpm_model - print("Model not loaded, initializing...", file=sys.stderr) - model_dir = self._resolve_model_dir() - print(f"Using model dir: {model_dir}", file=sys.stderr) - self.voxcpm_model = voxcpm.VoxCPM(voxcpm_model_path=model_dir) - print("Model loaded successfully.", file=sys.stderr) - return self.voxcpm_model - - # ---------- Functional endpoints ---------- - def prompt_wav_recognition(self, prompt_wav: Optional[str]) -> str: - if prompt_wav is None: - return "" - res = self.asr_model.generate(input=prompt_wav, language="auto", use_itn=True) - text = res[0]["text"].split('|>')[-1] - return text - - def generate_tts_audio( - self, - text_input: str, - prompt_wav_path_input: Optional[str] = None, - prompt_text_input: Optional[str] = None, - cfg_value_input: float = 2.0, - inference_timesteps_input: int = 10, - do_normalize: bool = True, - denoise: bool = True, - ) -> Tuple[int, np.ndarray]: - """ - Generate speech from text using VoxCPM; optional reference audio for voice style guidance. - Returns (sample_rate, waveform_numpy) - """ - current_model = self.get_or_load_voxcpm() - - text = (text_input or "").strip() - if len(text) == 0: - raise ValueError("Please input text to synthesize.") - - prompt_wav_path = prompt_wav_path_input if prompt_wav_path_input else None - prompt_text = prompt_text_input if prompt_text_input else None - - print(f"Generating audio for text: '{text[:60]}...'", file=sys.stderr) - wav = current_model.generate( - text=text, - prompt_text=prompt_text, - prompt_wav_path=prompt_wav_path, - cfg_value=float(cfg_value_input), - inference_timesteps=int(inference_timesteps_input), - normalize=do_normalize, - denoise=denoise, - ) - return (current_model.tts_model.sample_rate, wav) - - -# ---------- UI Builders ---------- - -_APP_THEME = gr.themes.Soft( - primary_hue="blue", - secondary_hue="gray", - neutral_hue="slate", - font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"], -) - -_CUSTOM_CSS = """ -.logo-container { - text-align: center; - margin: 0.5rem 0 1rem 0; -} -.logo-container img { - height: 80px; - width: auto; - max-width: 200px; - display: inline-block; -} -/* Bold accordion labels */ -#acc_quick details > summary, -#acc_tips details > summary { - font-weight: 600 !important; - font-size: 1.1em !important; -} -/* Bold labels for specific checkboxes */ -#chk_denoise label, -#chk_denoise span, -#chk_normalize label, -#chk_normalize span { - font-weight: 600; -} -""" - - -def create_demo_interface(demo: VoxCPMDemo): - """Build the Gradio UI for VoxCPM demo.""" - gr.set_static_paths(paths=[Path.cwd().absolute()/"assets"]) - - with gr.Blocks() as interface: - # Header logo - gr.HTML('
VoxCPM Logo
') - - # Quick Start - with gr.Accordion("📋 Quick Start Guide |快速入门", open=False, elem_id="acc_quick"): - gr.Markdown(""" - ### How to Use |使用说明 - 1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis. - **(可选)提供参考声音** - 上传或录制一段音频,为声音合成提供音色、语调和情感等个性化特征 - 2. **(Optional) Enter prompt text** - If you provided a voice prompt, enter the corresponding transcript here (auto-recognition available). - **(可选项)输入参考文本** - 如果提供了参考语音,请输入其对应的文本内容(支持自动识别)。 - 3. **Enter target text** - Type the text you want the model to speak. - **输入目标文本** - 输入您希望模型朗读的文字内容。 - 4. **Generate Speech** - Click the "Generate" button to create your audio. - **生成语音** - 点击"生成"按钮,即可为您创造出音频。 - """) - - # Pro Tips - with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"): - gr.Markdown(""" - ### Prompt Speech Enhancement|参考语音降噪 - - **Enable** to remove background noise for a clean voice, with an external ZipEnhancer component. However, this will limit the audio sampling rate to 16kHz, restricting the cloning quality ceiling. - **启用**:通过 ZipEnhancer 组件消除背景噪音,但会将音频采样率限制在16kHz,限制克隆上限。 - - **Disable** to preserve the original audio's all information, including background atmosphere, and support audio cloning up to 44.1kHz sampling rate. - **禁用**:保留原始音频的全部信息,包括背景环境声,最高支持44.1kHz的音频复刻。 - - ### Text Normalization|文本正则化 - - **Enable** to process general text with an external WeTextProcessing component. - **启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。 - - **Disable** to use VoxCPM's native text understanding ability. For example, it supports phonemes input (For Chinese, phonemes are converted using pinyin, {ni3}{hao3}; For English, phonemes are converted using CMUDict, {HH AH0 L OW1}), try it! - **禁用**:将使用 VoxCPM 内置的文本理解能力。如,支持音素输入(如中文转拼音:{ni3}{hao3};英文转CMUDict:{HH AH0 L OW1})和公式符号合成,尝试一下! - - ### CFG Value|CFG 值 - - **Lower CFG** if the voice prompt sounds strained or expressive, or instability occurs with long text input. - **调低**:如果提示语音听起来不自然或过于夸张,或者长文本输入出现稳定性问题。 - - **Higher CFG** for better adherence to the prompt speech style or input text, or instability occurs with too short text input. - **调高**:为更好地贴合提示音频的风格或输入文本, 或者极短文本输入出现稳定性问题。 - - ### Inference Timesteps|推理时间步 - - **Lower** for faster synthesis speed. - **调低**:合成速度更快。 - - **Higher** for better synthesis quality. - **调高**:合成质量更佳。 - """) - - # Main controls - with gr.Row(): - with gr.Column(): - prompt_wav = gr.Audio( - sources=["upload", 'microphone'], - type="filepath", - label="Prompt Speech (Optional, or let VoxCPM improvise)", - value="./examples/example.wav", - ) - DoDenoisePromptAudio = gr.Checkbox( - value=False, - label="Prompt Speech Enhancement", - elem_id="chk_denoise", - info="We use ZipEnhancer model to denoise the prompt audio." - ) - with gr.Row(): - prompt_text = gr.Textbox( - value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.", - label="Prompt Text", - placeholder="Please enter the prompt text. Automatic recognition is supported, and you can correct the results yourself..." - ) - run_btn = gr.Button("Generate Speech", variant="primary") - - with gr.Column(): - cfg_value = gr.Slider( - minimum=1.0, - maximum=3.0, - value=2.0, - step=0.1, - label="CFG Value (Guidance Scale)", - info="Higher values increase adherence to prompt, lower values allow more creativity" - ) - inference_timesteps = gr.Slider( - minimum=4, - maximum=30, - value=10, - step=1, - label="Inference Timesteps", - info="Number of inference timesteps for generation (higher values may improve quality but slower)" - ) - with gr.Row(): - text = gr.Textbox( - value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.", - label="Target Text", - ) - with gr.Row(): - DoNormalizeText = gr.Checkbox( - value=False, - label="Text Normalization", - elem_id="chk_normalize", - info="We use wetext library to normalize the input text." - ) - audio_output = gr.Audio(label="Output Audio") - - # Wiring - run_btn.click( - fn=demo.generate_tts_audio, - inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText, DoDenoisePromptAudio], - outputs=[audio_output], - show_progress=True, - api_name="generate", - ) - prompt_wav.change(fn=demo.prompt_wav_recognition, inputs=[prompt_wav], outputs=[prompt_text]) - - return interface - - -def run_demo(server_name: str = "localhost", server_port: int = 7860, show_error: bool = True): - demo = VoxCPMDemo() - interface = create_demo_interface(demo) - interface.queue(max_size=10, default_concurrency_limit=1).launch( - server_name=server_name, - server_port=server_port, - show_error=show_error, - theme=_APP_THEME, - css=_CUSTOM_CSS, - ) - - -if __name__ == "__main__": - run_demo() \ No newline at end of file diff --git a/modeldown.py b/modeldown.py new file mode 100644 index 0000000..d72f8b1 --- /dev/null +++ b/modeldown.py @@ -0,0 +1,33 @@ +""" +模型下载脚本 + +""" +from modelscope import snapshot_download + + +def download(repo_id:str, local_dir:str): + """ + 下载模型仓库或单个文件 + + + Args: + repo_id (str): 用户名/仓库名,例如 'stabilityai/sdxl-turbo' + local_dir (str or Path): 下载文件放置的本地目录路径 + + Returns: + str: 下载文件的本地路径 + + Raises: + ValueError: 当 repo_id 格式不正确时 + """ + model_dir = snapshot_download( + repo_id, + repo_type='model', + local_dir=f"{local_dir}/{repo_id}", + ) + + + +if __name__ == "__main__": + download("OpenBMB/VoxCPM2", "./models") + download("iic/SenseVoiceSmall", "./models") diff --git a/pyproject.toml b/pyproject.toml index 45ae2ee..e96055b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,7 @@ dependencies = [ "funasr", "spaces", "argbind", - "safetensors" - + "safetensors", ] [project.optional-dependencies]