support voxcpm2 cli

2026-04-01 21:15:55 +08:00
parent 42c428164c
commit addee2c550
8 changed files with 1642 additions and 375 deletions
@@ -2,17 +2,22 @@
 """
 VoxCPM Command Line Interface

-Unified CLI for voice cloning, direct TTS synthesis, and batch processing.
+VoxCPM2-first CLI for voice design, cloning, and batch processing.
 """

 import argparse
+import json
 import os
 import sys
 from pathlib import Path
+
 import soundfile as sf

 from voxcpm.core import VoxCPM

+
+DEFAULT_HF_MODEL_ID = "openbmb/VoxCPM2"
+
 # -----------------------------
 # Validators
 # -----------------------------
@@ -25,6 +30,13 @@ def validate_file_exists(file_path: str, file_type: str = "file") -> Path:
    return path


+def require_file_exists(file_path: str, parser, file_type: str = "file") -> Path:
+    """Return the validated path, reporting a missing file via ``parser.error``.
+
+    Delegates to ``validate_file_exists``; a ``FileNotFoundError`` raised there
+    is converted into a ``parser.error`` call carrying the exception text.
+    """
+    try:
+        checked_path = validate_file_exists(file_path, file_type)
+    except FileNotFoundError as missing:
+        parser.error(str(missing))
+    else:
+        return checked_path
+
+
 def validate_output_path(output_path: str) -> Path:
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
@@ -49,6 +61,113 @@ def validate_ranges(args, parser):
        parser.error("--lora-dropout must be between 0.0 and 1.0")


+def warn_legacy_mode():
+    """Print the deprecation notice for legacy root-level CLI arguments to stderr."""
+    notice = "Warning: legacy root CLI arguments are deprecated. Prefer `voxcpm design|clone|batch ...`."
+    print(notice, file=sys.stderr)
+
+
+def build_final_text(text: str, control: str | None) -> str:
+    """Prefix ``text`` with ``(control)`` when a non-blank control string is given.
+
+    The control string is stripped of surrounding whitespace first; a missing
+    or blank control leaves ``text`` unchanged.
+    """
+    instruction = control.strip() if control else ""
+    if not instruction:
+        return text
+    return f"({instruction}){text}"
+
+
+def resolve_prompt_text(args, parser) -> str | None:
+    """Return the stripped prompt transcript from ``--prompt-text`` or ``--prompt-file``.
+
+    Both attributes are read with ``getattr`` so ``args`` may lack them.
+    Supplying both is reported through ``parser.error``. When a prompt file is
+    given, it is checked with ``require_file_exists`` and read as UTF-8.
+
+    Returns:
+        The stripped prompt text, or ``None`` when neither option was provided.
+    """
+    inline_text = getattr(args, "prompt_text", None)
+    text_file = getattr(args, "prompt_file", None)
+
+    if inline_text and text_file:
+        parser.error("Use either --prompt-text or --prompt-file, not both.")
+
+    if text_file:
+        source = require_file_exists(text_file, parser, "prompt text file")
+        return source.read_text(encoding="utf-8").strip()
+
+    return inline_text.strip() if inline_text else None
+
+
+def detect_model_architecture(args) -> str | None:
+    """Best-effort guess of the model architecture named by the CLI arguments.
+
+    ``args.model_path`` takes precedence over ``args.hf_model_id``. For a local
+    directory, the ``architecture`` key of its ``config.json`` is returned
+    lower-cased (``"voxcpm"`` when the key is absent). Otherwise the location
+    string is matched against known VoxCPM2 / VoxCPM1.5 name patterns.
+
+    Returns:
+        The architecture name, or ``None`` when it cannot be determined: no
+        location given, no ``config.json``, an unreadable or malformed config,
+        a non-string ``architecture`` value, or an unrecognised model name.
+    """
+    model_location = getattr(args, "model_path", None) or getattr(
+        args, "hf_model_id", None
+    )
+    if not model_location:
+        return None
+
+    if os.path.isdir(model_location):
+        config_path = Path(model_location) / "config.json"
+        if not config_path.exists():
+            return None
+
+        # This result only feeds argument validation, so a broken config should
+        # not crash the CLI with a traceback here; treat it as "unknown".
+        try:
+            with open(config_path, "r", encoding="utf-8") as f:
+                config = json.load(f)
+        except (OSError, ValueError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            return None
+
+        if not isinstance(config, dict):
+            return None
+        architecture = config.get("architecture", "voxcpm")
+        return architecture.lower() if isinstance(architecture, str) else None
+
+    model_hint = str(model_location).lower()
+    if "voxcpm2" in model_hint:
+        return "voxcpm2"
+    legacy_markers = ("voxcpm1.5", "voxcpm-1.5", "voxcpm_1.5")
+    if any(marker in model_hint for marker in legacy_markers):
+        return "voxcpm"
+
+    return None
+
+
+def validate_prompt_related_args(args, parser, prompt_text: str | None):
+    """Check that prompt audio, prompt text and ``--control`` are combined validly.
+
+    Reports through ``parser.error`` when prompt text is given without prompt
+    audio, when prompt audio is given without prompt text, or when
+    ``--control`` is combined with prompt text.
+    """
+    has_audio = bool(args.prompt_audio)
+    has_text = bool(prompt_text)
+
+    if has_text and not has_audio:
+        parser.error("--prompt-text/--prompt-file requires --prompt-audio.")
+
+    if has_audio and not has_text:
+        parser.error("--prompt-audio requires --prompt-text or --prompt-file.")
+
+    if args.control and has_text:
+        parser.error(
+            "--control cannot be used together with --prompt-text or --prompt-file."
+        )
+
+
+def validate_reference_support(args, parser):
+    """Reject ``--reference-audio`` when the model is detected as ``voxcpm``.
+
+    Does nothing when no reference audio was requested. An undetermined
+    architecture is not treated as an error.
+    """
+    wants_reference = bool(getattr(args, "reference_audio", None))
+    if wants_reference and detect_model_architecture(args) == "voxcpm":
+        parser.error("--reference-audio is only supported with VoxCPM2 models.")
+
+
+def validate_design_args(args, parser):
+    """Report an error if the ``design`` command was given any prompt/reference input.
+
+    Prompt audio, reference audio, or a resolved prompt text (from
+    ``--prompt-text`` / ``--prompt-file``) each trigger ``parser.error``.
+    """
+    resolved_prompt = resolve_prompt_text(args, parser)
+    uses_voice_source = args.prompt_audio or args.reference_audio or resolved_prompt
+    if uses_voice_source:
+        parser.error(
+            "`design` does not accept prompt/reference audio. Use `clone` instead."
+        )
+
+
+def validate_clone_args(args, parser):
+    """Validate ``clone`` arguments and return the resolved prompt text.
+
+    Runs the prompt-combination and reference-support checks, then requires
+    that at least one of prompt audio or reference audio was supplied.
+
+    Returns:
+        The prompt text resolved by ``resolve_prompt_text`` (may be ``None``).
+    """
+    resolved_prompt = resolve_prompt_text(args, parser)
+    validate_prompt_related_args(args, parser, resolved_prompt)
+    validate_reference_support(args, parser)
+
+    has_voice_source = args.prompt_audio or args.reference_audio
+    if not has_voice_source:
+        parser.error(
+            "`clone` requires --reference-audio, or --prompt-audio with --prompt-text/--prompt-file."
+        )
+
+    return resolved_prompt
+
+
+def validate_batch_args(args, parser):
+    """Validate ``batch`` arguments and return the resolved prompt text.
+
+    Unlike ``clone``, no prompt or reference audio is required here; only the
+    prompt-combination and reference-support checks are applied.
+    """
+    resolved_prompt = resolve_prompt_text(args, parser)
+    validate_prompt_related_args(args, parser, resolved_prompt)
+    validate_reference_support(args, parser)
+    return resolved_prompt
+
+
 # -----------------------------
 # Model loading
 # -----------------------------
@@ -57,7 +176,9 @@ def validate_ranges(args, parser):
 def load_model(args) -> VoxCPM:
    print("Loading VoxCPM model...", file=sys.stderr)

-    zipenhancer_path = getattr(args, "zipenhancer_path", None) or os.environ.get("ZIPENHANCER_MODEL_PATH", None)
+    zipenhancer_path = getattr(args, "zipenhancer_path", None) or os.environ.get(
+        "ZIPENHANCER_MODEL_PATH", None
+    )

    # Build LoRA config if provided
    lora_config = None
@@ -87,6 +208,7 @@ def load_model(args) -> VoxCPM:
                voxcpm_model_path=args.model_path,
                zipenhancer_model_path=zipenhancer_path,
                enable_denoiser=not args.no_denoiser,
+                optimize=not args.no_optimize,
                lora_config=lora_config,
                lora_weights_path=lora_weights_path,
            )
@@ -104,6 +226,7 @@ def load_model(args) -> VoxCPM:
            zipenhancer_model_id=zipenhancer_path,
            cache_dir=args.cache_dir,
            local_files_only=args.local_files_only,
+            optimize=not args.no_optimize,
            lora_config=lora_config,
            lora_weights_path=lora_weights_path,
        )
@@ -119,32 +242,26 @@ def load_model(args) -> VoxCPM:
 # -----------------------------


-def cmd_clone(args):
-    if not args.text:
-        sys.exit("Error: Please provide --text for synthesis")
-
-    has_prompt = args.prompt_audio and args.prompt_text
-    has_ref = args.reference_audio is not None
-    if not has_prompt and not has_ref:
-        sys.exit("Error: Voice cloning requires --prompt-audio + --prompt-text, or --reference-audio, or both")
+def _run_single(args, parser, *, text: str, output: str, prompt_text: str | None):
+    output_path = validate_output_path(output)

    if args.prompt_audio:
-        validate_file_exists(args.prompt_audio, "prompt audio file")
+        require_file_exists(args.prompt_audio, parser, "prompt audio file")
    if args.reference_audio:
-        validate_file_exists(args.reference_audio, "reference audio file")
-    output_path = validate_output_path(args.output)
+        require_file_exists(args.reference_audio, parser, "reference audio file")

    model = load_model(args)

    audio_array = model.generate(
-        text=args.text,
-        prompt_wav_path=args.prompt_audio if has_prompt else None,
-        prompt_text=args.prompt_text if has_prompt else None,
+        text=text,
+        prompt_wav_path=args.prompt_audio,
+        prompt_text=prompt_text,
        reference_wav_path=args.reference_audio,
        cfg_value=args.cfg_value,
        inference_timesteps=args.inference_timesteps,
        normalize=args.normalize,
-        denoise=args.denoise,
+        denoise=args.denoise
+        and (args.prompt_audio is not None or args.reference_audio is not None),
    )

    sf.write(str(output_path), audio_array, model.tts_model.sample_rate)
@@ -153,31 +270,24 @@ def cmd_clone(args):
    print(f"Saved audio to: {output_path} ({duration:.2f}s)", file=sys.stderr)


-def cmd_synthesize(args):
-    if not args.text:
-        sys.exit("Error: Please provide --text for synthesis")
-
-    output_path = validate_output_path(args.output)
-    model = load_model(args)
-
-    audio_array = model.generate(
-        text=args.text,
-        prompt_wav_path=None,
-        prompt_text=None,
-        cfg_value=args.cfg_value,
-        inference_timesteps=args.inference_timesteps,
-        normalize=args.normalize,
-        denoise=False,
+def cmd_design(args, parser):
+    validate_design_args(args, parser)
+    final_text = build_final_text(args.text, args.control)
+    return _run_single(
+        args, parser, text=final_text, output=args.output, prompt_text=None
    )

-    sf.write(str(output_path), audio_array, model.tts_model.sample_rate)

-    duration = len(audio_array) / model.tts_model.sample_rate
-    print(f"Saved audio to: {output_path} ({duration:.2f}s)", file=sys.stderr)
+def cmd_clone(args, parser):
+    """Run the ``clone`` command: validate arguments, then synthesize one file.
+
+    The text passed to ``_run_single`` is ``args.text`` with the optional
+    ``args.control`` prefix applied by ``build_final_text``.
+    """
+    resolved_prompt = validate_clone_args(args, parser)
+    return _run_single(
+        args,
+        parser,
+        text=build_final_text(args.text, args.control),
+        output=args.output,
+        prompt_text=resolved_prompt,
+    )


-def cmd_batch(args):
-    input_file = validate_file_exists(args.input, "input file")
+def cmd_batch(args, parser):
+    input_file = require_file_exists(args.input, parser, "input file")
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

@@ -187,29 +297,36 @@ def cmd_batch(args):
    if not texts:
        sys.exit("Error: Input file is empty")

+    prompt_text = validate_batch_args(args, parser)
    model = load_model(args)

    prompt_audio_path = None
    if args.prompt_audio:
-        prompt_audio_path = str(validate_file_exists(args.prompt_audio, "prompt audio file"))
+        prompt_audio_path = str(
+            require_file_exists(args.prompt_audio, parser, "prompt audio file")
+        )

    reference_audio_path = None
    if args.reference_audio:
-        reference_audio_path = str(validate_file_exists(args.reference_audio, "reference audio file"))
+        reference_audio_path = str(
+            require_file_exists(args.reference_audio, parser, "reference audio file")
+        )

    success_count = 0

    for i, text in enumerate(texts, 1):
        try:
+            final_text = build_final_text(text, args.control)
            audio_array = model.generate(
-                text=text,
+                text=final_text,
                prompt_wav_path=prompt_audio_path,
-                prompt_text=args.prompt_text,
+                prompt_text=prompt_text,
                reference_wav_path=reference_audio_path,
                cfg_value=args.cfg_value,
                inference_timesteps=args.inference_timesteps,
                normalize=args.normalize,
-                denoise=args.denoise and (prompt_audio_path is not None or reference_audio_path is not None),
+                denoise=args.denoise
+                and (prompt_audio_path is not None or reference_audio_path is not None),
            )

            output_file = output_dir / f"output_{i:03d}.wav"
@@ -230,97 +347,251 @@ def cmd_batch(args):
 # -----------------------------


-def _build_unified_parser():
+def _add_common_generation_args(parser):
+    """Register the text, control and sampling options on ``parser``."""
+    # (flags, keyword arguments) in the order they should appear in --help.
+    option_specs = (
+        (("--text", "-t"), {"help": "Text to synthesize"}),
+        (
+            ("--control",),
+            {
+                "type": str,
+                "help": "Control instruction for VoxCPM2 voice design/cloning",
+            },
+        ),
+        (
+            ("--cfg-value",),
+            {
+                "type": float,
+                "default": 2.0,
+                "help": "CFG guidance scale (float, recommended 0.5–5.0, default: 2.0)",
+            },
+        ),
+        (
+            ("--inference-timesteps",),
+            {
+                "type": int,
+                "default": 10,
+                "help": "Inference steps (int, 1–100, default: 10)",
+            },
+        ),
+        (
+            ("--normalize",),
+            {"action": "store_true", "help": "Enable text normalization"},
+        ),
+    )
+    for flags, options in option_specs:
+        parser.add_argument(*flags, **options)
+
+
+def _add_prompt_reference_args(parser):
+    """Register the prompt/reference audio options and ``--denoise`` on ``parser``."""
+    # (flags, keyword arguments) in the order they should appear in --help.
+    option_specs = (
+        (
+            ("--prompt-audio", "-pa"),
+            {
+                "help": "Prompt audio file path (continuation mode, requires --prompt-text or --prompt-file)",
+            },
+        ),
+        (
+            ("--prompt-text", "-pt"),
+            {"help": "Text corresponding to the prompt audio"},
+        ),
+        (
+            ("--prompt-file",),
+            {"type": str, "help": "Text file corresponding to the prompt audio"},
+        ),
+        (
+            ("--reference-audio", "-ra"),
+            {"help": "Reference audio for voice cloning (VoxCPM2 only)"},
+        ),
+        (
+            ("--denoise",),
+            {
+                "action": "store_true",
+                "help": "Enable prompt/reference speech enhancement",
+            },
+        ),
+    )
+    for flags, options in option_specs:
+        parser.add_argument(*flags, **options)
+
+
+def _add_model_args(parser):
+    """Register the model-location and model-loading options on ``parser``."""
+    # (flag, keyword arguments) in the order they should appear in --help.
+    option_specs = (
+        ("--model-path", {"type": str, "help": "Local VoxCPM model path"}),
+        (
+            "--hf-model-id",
+            {
+                "type": str,
+                "default": DEFAULT_HF_MODEL_ID,
+                "help": f"Hugging Face repo id (default: {DEFAULT_HF_MODEL_ID})",
+            },
+        ),
+        (
+            "--cache-dir",
+            {"type": str, "help": "Cache directory for Hub downloads"},
+        ),
+        (
+            "--local-files-only",
+            {"action": "store_true", "help": "Disable network access"},
+        ),
+        (
+            "--no-denoiser",
+            {"action": "store_true", "help": "Disable denoiser model loading"},
+        ),
+        (
+            "--no-optimize",
+            {
+                "action": "store_true",
+                "help": "Disable model optimization during loading",
+            },
+        ),
+        (
+            "--zipenhancer-path",
+            {
+                "type": str,
+                "help": "ZipEnhancer model id or local path (or env ZIPENHANCER_MODEL_PATH)",
+            },
+        ),
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+
+
+def _add_lora_args(parser):
+    """Register the LoRA weight and hyper-parameter options on ``parser``."""
+    # (flag, keyword arguments) in the order they should appear in --help.
+    option_specs = (
+        ("--lora-path", {"type": str, "help": "Path to LoRA weights"}),
+        (
+            "--lora-r",
+            {
+                "type": int,
+                "default": 32,
+                "help": "LoRA rank (positive int, default: 32)",
+            },
+        ),
+        (
+            "--lora-alpha",
+            {
+                "type": int,
+                "default": 16,
+                "help": "LoRA alpha (positive int, default: 16)",
+            },
+        ),
+        (
+            "--lora-dropout",
+            {
+                "type": float,
+                "default": 0.0,
+                "help": "LoRA dropout rate (0.0–1.0, default: 0.0)",
+            },
+        ),
+        (
+            "--lora-disable-lm",
+            {"action": "store_true", "help": "Disable LoRA on LM layers"},
+        ),
+        (
+            "--lora-disable-dit",
+            {"action": "store_true", "help": "Disable LoRA on DiT layers"},
+        ),
+        (
+            "--lora-enable-proj",
+            {"action": "store_true", "help": "Enable LoRA on projection layers"},
+        ),
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+
+
+def _build_parser():
    parser = argparse.ArgumentParser(
-        description="VoxCPM CLI - voice cloning, direct TTS, and batch processing",
+        description="VoxCPM CLI - VoxCPM2-first voice design, cloning, and batch processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
-  voxcpm --text "Hello world" --output out.wav
-  voxcpm --text "Hello" --prompt-audio ref.wav --prompt-text "hi" --output out.wav --denoise
-  voxcpm --input texts.txt --output-dir ./outs
+  voxcpm design --text "Hello world" --output out.wav
+  voxcpm design --text "Hello world" --control "warm female voice" --output out.wav
+  voxcpm clone --text "Hello" --reference-audio ref.wav --output out.wav
+  voxcpm batch --input texts.txt --output-dir ./outs --reference-audio ref.wav
        """,
    )

-    # Mode selection
+    subparsers = parser.add_subparsers(dest="command")
+
+    design_parser = subparsers.add_parser(
+        "design", help="Generate speech with VoxCPM2-first voice design"
+    )
+    _add_common_generation_args(design_parser)
+    _add_prompt_reference_args(design_parser)
+    _add_model_args(design_parser)
+    _add_lora_args(design_parser)
+    design_parser.add_argument(
+        "--output", "-o", required=True, help="Output audio file path"
+    )
+
+    clone_parser = subparsers.add_parser(
+        "clone", help="Clone a voice with reference/prompt audio"
+    )
+    _add_common_generation_args(clone_parser)
+    _add_prompt_reference_args(clone_parser)
+    _add_model_args(clone_parser)
+    _add_lora_args(clone_parser)
+    clone_parser.add_argument(
+        "--output", "-o", required=True, help="Output audio file path"
+    )
+
+    batch_parser = subparsers.add_parser(
+        "batch", help="Batch-generate one line per output file"
+    )
+    batch_parser.add_argument(
+        "--input", "-i", required=True, help="Input text file (one text per line)"
+    )
+    batch_parser.add_argument(
+        "--output-dir", "-od", required=True, help="Output directory"
+    )
+    batch_parser.add_argument(
+        "--control",
+        type=str,
+        help="Control instruction for VoxCPM2 voice design/cloning",
+    )
+    _add_prompt_reference_args(batch_parser)
+    batch_parser.add_argument(
+        "--cfg-value",
+        type=float,
+        default=2.0,
+        help="CFG guidance scale (float, recommended 0.5–5.0, default: 2.0)",
+    )
+    batch_parser.add_argument(
+        "--inference-timesteps",
+        type=int,
+        default=10,
+        help="Inference steps (int, 1–100, default: 10)",
+    )
+    batch_parser.add_argument(
+        "--normalize", action="store_true", help="Enable text normalization"
+    )
+    _add_model_args(batch_parser)
+    _add_lora_args(batch_parser)
+
+    # Legacy root arguments
    parser.add_argument("--input", "-i", help="Input text file (batch mode only)")
-    parser.add_argument("--output-dir", "-od", help="Output directory (batch mode only)")
-    parser.add_argument("--text", "-t", help="Text to synthesize (single or clone mode)")
-    parser.add_argument("--output", "-o", help="Output audio file path (single or clone mode)")
-
-    # Prompt / Reference
    parser.add_argument(
-        "--prompt-audio", "-pa", help="Prompt audio file path (continuation mode, requires --prompt-text)"
+        "--output-dir", "-od", help="Output directory (batch mode only)"
    )
-    parser.add_argument("--prompt-text", "-pt", help="Text corresponding to the prompt audio")
+    _add_common_generation_args(parser)
    parser.add_argument(
-        "--reference-audio", "-ra", help="Reference audio for voice cloning (isolated mode, VoxCPM2 only)"
+        "--output", "-o", help="Output audio file path (single or clone mode)"
    )
-    parser.add_argument("--denoise", action="store_true", help="Enable prompt/reference speech enhancement")
-
-    # Generation parameters
-    parser.add_argument(
-        "--cfg-value", type=float, default=2.0, help="CFG guidance scale (float, recommended 0.5–5.0, default: 2.0)"
-    )
-    parser.add_argument("--inference-timesteps", type=int, default=10, help="Inference steps (int, 1–100, default: 10)")
-    parser.add_argument("--normalize", action="store_true", help="Enable text normalization")
-
-    # Model loading
-    parser.add_argument("--model-path", type=str, help="Local VoxCPM model path")
-    parser.add_argument(
-        "--hf-model-id", type=str, default="openbmb/VoxCPM1.5", help="Hugging Face repo id (default: openbmb/VoxCPM1.5)"
-    )
-    parser.add_argument("--cache-dir", type=str, help="Cache directory for Hub downloads")
-    parser.add_argument("--local-files-only", action="store_true", help="Disable network access")
-    parser.add_argument("--no-denoiser", action="store_true", help="Disable denoiser model loading")
-    parser.add_argument(
-        "--zipenhancer-path", type=str, help="ZipEnhancer model id or local path (or env ZIPENHANCER_MODEL_PATH)"
-    )
-
-    # LoRA
-    parser.add_argument("--lora-path", type=str, help="Path to LoRA weights")
-    parser.add_argument("--lora-r", type=int, default=32, help="LoRA rank (positive int, default: 32)")
-    parser.add_argument("--lora-alpha", type=int, default=16, help="LoRA alpha (positive int, default: 16)")
-    parser.add_argument("--lora-dropout", type=float, default=0.0, help="LoRA dropout rate (0.0–1.0, default: 0.0)")
-    parser.add_argument("--lora-disable-lm", action="store_true", help="Disable LoRA on LM layers")
-    parser.add_argument("--lora-disable-dit", action="store_true", help="Disable LoRA on DiT layers")
-    parser.add_argument("--lora-enable-proj", action="store_true", help="Enable LoRA on projection layers")
+    _add_prompt_reference_args(parser)
+    _add_model_args(parser)
+    _add_lora_args(parser)

    return parser


+def _dispatch_legacy(args, parser):
+    """Route an invocation without a subcommand to batch, clone or design.
+
+    Always emits the legacy deprecation warning first. ``--input`` selects
+    batch mode (and requires ``--output-dir``); otherwise ``--text`` and
+    ``--output`` are required, and any prompt/reference option selects clone
+    over design.
+    """
+    warn_legacy_mode()
+
+    if args.input and args.text:
+        parser.error(
+            "Use either batch mode (--input) or single mode (--text), not both."
+        )
+
+    if args.input:
+        if not args.output_dir:
+            parser.error("Batch mode requires --output-dir")
+        return cmd_batch(args, parser)
+
+    if not (args.text and args.output):
+        parser.error("Single-sample legacy mode requires --text and --output")
+
+    clone_inputs = (
+        args.prompt_audio,
+        args.prompt_text,
+        args.prompt_file,
+        args.reference_audio,
+    )
+    handler = cmd_clone if any(clone_inputs) else cmd_design
+    return handler(args, parser)
+
+
 # -----------------------------
 # Entrypoint
 # -----------------------------


 def main():
-    parser = _build_unified_parser()
+    parser = _build_parser()
    args = parser.parse_args()

-    # Validate ranges
    validate_ranges(args, parser)

-    # Mode conflict checks
-    if args.input and args.text:
-        parser.error("Use either batch mode (--input) or single mode (--text), not both.")
+    if args.command == "design":
+        if not args.text:
+            parser.error("`design` requires --text")
+        return cmd_design(args, parser)

-    # Batch mode
-    if args.input:
-        if not args.output_dir:
-            parser.error("Batch mode requires --output-dir")
-        return cmd_batch(args)
+    if args.command == "clone":
+        if not args.text or not args.output:
+            parser.error("`clone` requires --text and --output")
+        return cmd_clone(args, parser)

-    # Single mode
-    if not args.text or not args.output:
-        parser.error("Single-sample mode requires --text and --output")
+    if args.command == "batch":
+        return cmd_batch(args, parser)

-    # Clone mode (prompt continuation, reference isolation, or both)
-    if args.prompt_audio or args.prompt_text or args.reference_audio:
-        return cmd_clone(args)
-
-    # Direct synthesis
-    return cmd_synthesize(args)
+    return _dispatch_legacy(args, parser)


 if __name__ == "__main__":