diff --git a/.gitignore b/.gitignore index d397292..49a1772 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ voxcpm.egg-info .DS_Store ./pretrained_models/ -app_local.py \ No newline at end of file +app_local.py +models/ \ No newline at end of file diff --git a/app.py b/app.py index dba6fe3..1d8594f 100644 --- a/app.py +++ b/app.py @@ -54,7 +54,7 @@ _EXAMPLES_FOOTER_EN = ( ) _USAGE_INSTRUCTIONS_ZH = ( - "**VoxCPM2 — 三种语音生成方式:**\n\n" + "**三种语音生成方式:**\n\n" "🎨 **声音设计(Voice Design)** \n" "无需参考音频。在 **Control Instruction** 中描述目标音色特征" "(性别、年龄、语气、情绪、语速等),VoxCPM2 即可为你从零创造独一无二的声音。\n\n" @@ -65,6 +65,8 @@ _USAGE_INSTRUCTIONS_ZH = ( "开启 **极致克隆模式** 并提供参考音频的文字内容(可自动识别)。" "模型会将参考音频视为已说出的前文,以**音频续写**的方式完整还原参考音频中的所有声音细节。" "注意:该模式与可控克隆模式互斥,将禁用Control Instruction。\n\n" + "目前支持的方言包括:\n" + "「四川话、粤语、吴语、东北话、河南话、陕西话、山东话、天津话、闽南话」" ) _EXAMPLES_FOOTER_ZH = ( @@ -222,9 +224,9 @@ _APP_THEME = gr.themes.Soft( class VoxCPMDemo: def __init__(self, model_id: str = "openbmb/VoxCPM2") -> None: self.device = "cuda" if torch.cuda.is_available() else "cpu" - logger.info(f"Running on device: {self.device}") + logger.info(f"运行在设备上: {self.device}") - self.asr_model_id = "iic/SenseVoiceSmall" + self.asr_model_id = "./models/iic/SenseVoiceSmall" self.asr_model: Optional[AutoModel] = AutoModel( model=self.asr_model_id, disable_update=True, @@ -486,7 +488,7 @@ def run_demo( server_name: str = "0.0.0.0", server_port: int = 8808, show_error: bool = True, - model_id: str = "openbmb/VoxCPM2", + model_id: str = "./models/openbmb/VoxCPM2", ): demo = VoxCPMDemo(model_id=model_id) interface = create_demo_interface(demo) @@ -504,9 +506,9 @@ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument( - "--model-id", type=str, default="openbmb/VoxCPM2", - help="Local path or HuggingFace repo ID (default: openbmb/VoxCPM2)", + "--model-id", type=str, default="./models/openbmb/VoxCPM2", + help="本地路径或HuggingFace仓库ID(默认:./models/openbmb/VoxCPM2)", ) - parser.add_argument("--port", type=int, default=8808, help="Server port") + parser.add_argument("--port", type=int, default=8808, help="服务端口") args = parser.parse_args() - run_demo(model_id=args.model_id, server_port=args.port) + run_demo(model_id=args.model_id, server_port=args.port) \ No newline at end of file diff --git a/app_old.py b/app_old.py deleted file mode 100644 index d46c2e1..0000000 --- a/app_old.py +++ /dev/null @@ -1,280 +0,0 @@ -import os -import sys -import numpy as np -import torch -import gradio as gr -from typing import Optional, Tuple -from funasr import AutoModel -from pathlib import Path -os.environ["TOKENIZERS_PARALLELISM"] = "false" -if os.environ.get("HF_REPO_ID", "").strip() == "": - os.environ["HF_REPO_ID"] = "openbmb/VoxCPM1.5" - -import voxcpm - - -class VoxCPMDemo: - def __init__(self) -> None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" - print(f"🚀 Running on device: {self.device}", file=sys.stderr) - - # ASR model for prompt text recognition - self.asr_model_id = "iic/SenseVoiceSmall" - self.asr_model: Optional[AutoModel] = AutoModel( - model=self.asr_model_id, - disable_update=True, - log_level='DEBUG', - device="cuda:0" if self.device == "cuda" else "cpu", - ) - - # TTS model (lazy init) - self.voxcpm_model: Optional[voxcpm.VoxCPM] = None - self.default_local_model_dir = "./models/VoxCPM1.5" - - # ---------- Model helpers ---------- - def _resolve_model_dir(self) -> str: - """ - Resolve model directory: - 1) Use local checkpoint directory if exists - 2) If HF_REPO_ID env is set, download into models/{repo} - 3) Fallback to 'models' - """ - if os.path.isdir(self.default_local_model_dir): - return self.default_local_model_dir - - repo_id = os.environ.get("HF_REPO_ID", "").strip() - if len(repo_id) > 0: - target_dir = os.path.join("models", repo_id.replace("/", "__")) - if not os.path.isdir(target_dir): - try: - from huggingface_hub import snapshot_download # type: ignore - os.makedirs(target_dir, exist_ok=True) - print(f"Downloading model from HF repo '{repo_id}' to '{target_dir}' ...", file=sys.stderr) - snapshot_download(repo_id=repo_id, local_dir=target_dir, local_dir_use_symlinks=False) - except Exception as e: - print(f"Warning: HF download failed: {e}. Falling back to 'data'.", file=sys.stderr) - return "models" - return target_dir - return "models" - - def get_or_load_voxcpm(self) -> voxcpm.VoxCPM: - if self.voxcpm_model is not None: - return self.voxcpm_model - print("Model not loaded, initializing...", file=sys.stderr) - model_dir = self._resolve_model_dir() - print(f"Using model dir: {model_dir}", file=sys.stderr) - self.voxcpm_model = voxcpm.VoxCPM(voxcpm_model_path=model_dir) - print("Model loaded successfully.", file=sys.stderr) - return self.voxcpm_model - - # ---------- Functional endpoints ---------- - def prompt_wav_recognition(self, prompt_wav: Optional[str]) -> str: - if prompt_wav is None: - return "" - res = self.asr_model.generate(input=prompt_wav, language="auto", use_itn=True) - text = res[0]["text"].split('|>')[-1] - return text - - def generate_tts_audio( - self, - text_input: str, - prompt_wav_path_input: Optional[str] = None, - prompt_text_input: Optional[str] = None, - cfg_value_input: float = 2.0, - inference_timesteps_input: int = 10, - do_normalize: bool = True, - denoise: bool = True, - ) -> Tuple[int, np.ndarray]: - """ - Generate speech from text using VoxCPM; optional reference audio for voice style guidance. - Returns (sample_rate, waveform_numpy) - """ - current_model = self.get_or_load_voxcpm() - - text = (text_input or "").strip() - if len(text) == 0: - raise ValueError("Please input text to synthesize.") - - prompt_wav_path = prompt_wav_path_input if prompt_wav_path_input else None - prompt_text = prompt_text_input if prompt_text_input else None - - print(f"Generating audio for text: '{text[:60]}...'", file=sys.stderr) - wav = current_model.generate( - text=text, - prompt_text=prompt_text, - prompt_wav_path=prompt_wav_path, - cfg_value=float(cfg_value_input), - inference_timesteps=int(inference_timesteps_input), - normalize=do_normalize, - denoise=denoise, - ) - return (current_model.tts_model.sample_rate, wav) - - -# ---------- UI Builders ---------- - -_APP_THEME = gr.themes.Soft( - primary_hue="blue", - secondary_hue="gray", - neutral_hue="slate", - font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"], -) - -_CUSTOM_CSS = """ -.logo-container { - text-align: center; - margin: 0.5rem 0 1rem 0; -} -.logo-container img { - height: 80px; - width: auto; - max-width: 200px; - display: inline-block; -} -/* Bold accordion labels */ -#acc_quick details > summary, -#acc_tips details > summary { - font-weight: 600 !important; - font-size: 1.1em !important; -} -/* Bold labels for specific checkboxes */ -#chk_denoise label, -#chk_denoise span, -#chk_normalize label, -#chk_normalize span { - font-weight: 600; -} -""" - - -def create_demo_interface(demo: VoxCPMDemo): - """Build the Gradio UI for VoxCPM demo.""" - gr.set_static_paths(paths=[Path.cwd().absolute()/"assets"]) - - with gr.Blocks() as interface: - # Header logo - gr.HTML('
