diff --git a/conf/voxcpm_v2/voxcpm_finetune_all.yaml b/conf/voxcpm_v2/voxcpm_finetune_all.yaml index 0b32beb..a949b4c 100644 --- a/conf/voxcpm_v2/voxcpm_finetune_all.yaml +++ b/conf/voxcpm_v2/voxcpm_finetune_all.yaml @@ -2,7 +2,7 @@ pretrained_path: /path/to/VoxCPM2/ train_manifest: /path/to/train.jsonl val_manifest: null sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate -out_sample_rate: 48000 # AudioVAE decoder output rate; only used at inference, not during training +out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging batch_size: 2 grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16 num_workers: 8 @@ -15,6 +15,7 @@ weight_decay: 0.01 warmup_steps: 100 max_steps: 1000 max_batch_tokens: 8192 +max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled save_path: /path/to/checkpoints/finetune_all tensorboard: /path/to/logs/finetune_all lambdas: diff --git a/conf/voxcpm_v2/voxcpm_finetune_lora.yaml b/conf/voxcpm_v2/voxcpm_finetune_lora.yaml index 32c9a40..515879e 100644 --- a/conf/voxcpm_v2/voxcpm_finetune_lora.yaml +++ b/conf/voxcpm_v2/voxcpm_finetune_lora.yaml @@ -2,7 +2,7 @@ pretrained_path: /path/to/VoxCPM2/ train_manifest: /path/to/train.jsonl val_manifest: null sample_rate: 16000 # AudioVAE encoder input rate; must match audio_vae_config.sample_rate -out_sample_rate: 48000 # AudioVAE decoder output rate; only used at inference, not during training +out_sample_rate: 48000 # AudioVAE decoder output rate; used for TensorBoard audio logging batch_size: 2 grad_accum_steps: 8 # effective batch size = batch_size × grad_accum_steps = 16 num_workers: 8 @@ -15,6 +15,7 @@ weight_decay: 0.01 warmup_steps: 100 max_steps: 1000 max_batch_tokens: 8192 +max_grad_norm: 1.0 # gradient clipping max norm; 0 = disabled save_path: /path/to/checkpoints/finetune_lora tensorboard: /path/to/logs/finetune_lora lambdas: diff --git a/lora_ft_webui.py b/lora_ft_webui.py index e9982ea..439261b 100644 --- a/lora_ft_webui.py +++ b/lora_ft_webui.py @@ -14,8 +14,10 @@ from typing import Optional project_root = Path(__file__).parent sys.path.insert(0, str(project_root / "src")) -# Default pretrained model path relative to this repo -default_pretrained_path = str(project_root / "models" / "openbmb__VoxCPM1.5") +# Default pretrained model path: prefer VoxCPM2 if it exists, fallback to VoxCPM1.5 +_v2_path = project_root / "models" / "openbmb__VoxCPM2" +_v15_path = project_root / "models" / "openbmb__VoxCPM1.5" +default_pretrained_path = str(_v2_path if _v2_path.exists() else _v15_path) from voxcpm.core import VoxCPM from voxcpm.model.voxcpm import LoRAConfig @@ -368,6 +370,7 @@ def start_training( warmup_steps=100, max_steps=None, sample_rate=44100, + max_grad_norm=1.0, # LoRA advanced enable_lm=True, enable_dit=True, @@ -409,11 +412,25 @@ def start_training( # Resolve max_steps default resolved_max_steps = int(max_steps) if max_steps not in (None, "", 0) else int(num_iters) + # Auto-detect out_sample_rate from model config + out_sample_rate = 0 + config_file = os.path.join(pretrained_path, "config.json") + if os.path.isfile(config_file): + try: + with open(config_file, "r", encoding="utf-8") as f: + cfg = json.load(f) + out_sr = cfg.get("audio_vae_config", {}).get("out_sample_rate") + if out_sr: + out_sample_rate = int(out_sr) + except Exception: + pass + config = { "pretrained_path": pretrained_path, "train_manifest": train_manifest, "val_manifest": val_manifest, "sample_rate": int(sample_rate), + "out_sample_rate": out_sample_rate, "batch_size": int(batch_size), "grad_accum_steps": int(grad_accum_steps), "num_workers": int(num_workers), @@ -425,6 +442,7 @@ def start_training( "weight_decay": float(weight_decay), "warmup_steps": int(warmup_steps), "max_steps": resolved_max_steps, + "max_grad_norm": float(max_grad_norm), "save_path": checkpoints_dir, "tensorboard": tensorboard_path if tensorboard_path else logs_dir, "lambdas": {"loss/diff": 1.0, "loss/stop": 1.0}, @@ -932,17 +950,19 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css with gr.Row(): max_steps = gr.Number(label="最大步数 (max_steps, 0→默认num_iters)", value=0, precision=0) sample_rate = gr.Number(label="采样率 (sample_rate)", value=44100, precision=0) - tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="") + max_grad_norm = gr.Number(label="梯度裁剪 (max_grad_norm, 0=关闭)", value=1.0) with gr.Row(): + tensorboard_path = gr.Textbox(label="Tensorboard 路径 (可选)", value="") enable_lm = gr.Checkbox(label="启用 LoRA LM (enable_lm)", value=True) enable_dit = gr.Checkbox(label="启用 LoRA DIT (enable_dit)", value=True) + with gr.Row(): enable_proj = gr.Checkbox(label="启用投影 (enable_proj)", value=False) dropout = gr.Number(label="LoRA Dropout", value=0.0) gr.Markdown("#### 分发选项 (Distribution)") with gr.Row(): hf_model_id = gr.Textbox( - label="HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)", value="openbmb/VoxCPM1.5" + label="HuggingFace Model ID (e.g., openbmb/VoxCPM2)", value="" ) distribute = gr.Checkbox(label="分发模式 (distribute)", value=False) @@ -992,6 +1012,7 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css warmup_steps, max_steps, sample_rate, + max_grad_norm, enable_lm, enable_dit, enable_proj, @@ -1150,12 +1171,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css "warmup_steps": "warmup_steps", "max_steps": "最大步数 (max_steps)", "sample_rate": "采样率 (sample_rate)", + "max_grad_norm": "梯度裁剪 (max_grad_norm, 0=关闭)", "enable_lm": "启用 LoRA LM (enable_lm)", "enable_dit": "启用 LoRA DIT (enable_dit)", "enable_proj": "启用投影 (enable_proj)", "dropout": "LoRA Dropout", "tensorboard_path": "Tensorboard 路径 (可选)", - "hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)", + "hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)", "distribute": "分发模式 (distribute)", } else: @@ -1168,12 +1190,13 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css "warmup_steps": "Warmup Steps", "max_steps": "Max Steps", "sample_rate": "Sample Rate", + "max_grad_norm": "Max Grad Norm (0=disabled)", "enable_lm": "Enable LoRA LM", "enable_dit": "Enable LoRA DIT", "enable_proj": "Enable Projection", "dropout": "LoRA Dropout", "tensorboard_path": "Tensorboard Path (Optional)", - "hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM1.5)", + "hf_model_id": "HuggingFace Model ID (e.g., openbmb/VoxCPM2)", "distribute": "Distribute Mode", } @@ -1203,11 +1226,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css gr.update(label=adv["warmup_steps"]), gr.update(label=adv["max_steps"]), gr.update(label=adv["sample_rate"]), + gr.update(label=adv["max_grad_norm"]), + gr.update(label=adv["tensorboard_path"]), gr.update(label=adv["enable_lm"]), gr.update(label=adv["enable_dit"]), gr.update(label=adv["enable_proj"]), gr.update(label=adv["dropout"]), - gr.update(label=adv["tensorboard_path"]), # Distribution options gr.update(label=adv["hf_model_id"]), gr.update(label=adv["distribute"]), @@ -1254,11 +1278,12 @@ with gr.Blocks(title="VoxCPM LoRA WebUI", theme=gr.themes.Soft(), css=custom_css warmup_steps, max_steps, sample_rate, + max_grad_norm, + tensorboard_path, enable_lm, enable_dit, enable_proj, dropout, - tensorboard_path, # distribution outputs hf_model_id, distribute, diff --git a/scripts/train_voxcpm_finetune.py b/scripts/train_voxcpm_finetune.py index 2b05e6b..c3da4dc 100644 --- a/scripts/train_voxcpm_finetune.py +++ b/scripts/train_voxcpm_finetune.py @@ -30,7 +30,8 @@ except ImportError: import json from voxcpm.model import VoxCPMModel, VoxCPM2Model -from voxcpm.model.voxcpm import LoRAConfig +from voxcpm.model.voxcpm import LoRAConfig as LoRAConfigV1 +from voxcpm.model.voxcpm2 import LoRAConfig as LoRAConfigV2 from voxcpm.training import ( Accelerator, BatchProcessor, @@ -46,7 +47,7 @@ def train( train_manifest: str, val_manifest: str = "", sample_rate: int = 16_000, - out_sample_rate: int = 0, # accepted from YAML for documentation; not used in training + out_sample_rate: int = 0, # AudioVAE decoder output rate; used for TensorBoard audio logging batch_size: int = 1, grad_accum_steps: int = 1, num_workers: int = 2, @@ -64,12 +65,12 @@ def train( lambdas: Dict[str, float] = {"loss/diff": 1.0, "loss/stop": 1.0}, lora: dict = None, config_path: str = "", + max_grad_norm: float = 0.0, # gradient clipping; 0 = disabled (backward compat) # Distribution options (for LoRA checkpoints) hf_model_id: str = "", # HuggingFace model ID (e.g., "openbmb/VoxCPM1.5") distribute: bool = False, # If True, save hf_model_id as base_model; otherwise save pretrained_path ): _ = config_path - _ = out_sample_rate # Validate distribution options if lora is not None and distribute and not hf_model_id: @@ -93,6 +94,7 @@ def train( with open(os.path.join(pretrained_path, "config.json"), "r", encoding="utf-8") as _f: _arch = json.load(_f).get("architecture", "voxcpm").lower() _model_cls = VoxCPM2Model if _arch == "voxcpm2" else VoxCPMModel + LoRAConfig = LoRAConfigV2 if _arch == "voxcpm2" else LoRAConfigV1 if accelerator.rank == 0: print(f"Detected architecture: {_arch} -> {_model_cls.__name__}", file=sys.stderr) base_model = _model_cls.from_local( @@ -178,8 +180,12 @@ def train( dataset_cnt=dataset_cnt, device=accelerator.device, ) - # Save audio_vae for audio generation + # Save audio_vae and output sample rate for audio generation. + # Prefer model's actual output rate; fall back to YAML out_sample_rate or encode rate. audio_vae_for_gen = base_model.audio_vae + out_sr = base_model.sample_rate # decoder output rate (e.g. 48000 for V2) + if out_sr == 0 and out_sample_rate > 0: + out_sr = out_sample_rate del base_model.audio_vae model = accelerator.prepare_model(base_model) unwrapped_model = accelerator.unwrap(model) @@ -312,8 +318,8 @@ def train( scaler = getattr(accelerator, "scaler", None) if scaler is not None: scaler.unscale_(optimizer) - # Use large max_norm to only compute grad_norm without actual clipping - grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=1e9) + effective_max_norm = max_grad_norm if max_grad_norm > 0 else 1e9 + grad_norm = torch.nn.utils.clip_grad_norm_(unwrapped_model.parameters(), max_norm=effective_max_norm) accelerator.step(optimizer) accelerator.update() @@ -341,6 +347,7 @@ def train( val_ds=val_ds, audio_vae=audio_vae_for_gen, sample_rate=sample_rate, + out_sample_rate=out_sr, val_texts=val_texts, tokenizer=tokenizer, valid_interval=valid_interval, @@ -367,6 +374,7 @@ def validate( val_ds=None, audio_vae=None, sample_rate=22050, + out_sample_rate=0, val_texts=None, tokenizer=None, valid_interval=1000, @@ -432,6 +440,7 @@ def validate( step, accelerator, sample_rate, + out_sample_rate=out_sample_rate, val_texts=val_texts, tokenizer=tokenizer, valid_interval=valid_interval, @@ -534,6 +543,7 @@ def generate_sample_audio( step, accelerator, sample_rate=22050, + out_sample_rate=0, val_texts=None, tokenizer=None, pretrained_path=None, @@ -548,6 +558,10 @@ def generate_sample_audio( log(f"[Audio] Starting audio generation for {num_samples} samples at step {step}") unwrapped_model = accelerator.unwrap(model) + # Determine the correct output sample rate for generated audio. + # out_sample_rate is the decoder output rate (e.g. 48kHz for V2); + # sample_rate is the encoder input rate (e.g. 16kHz for V2). + gen_sr = out_sample_rate if out_sample_rate > 0 else sample_rate for i in range(num_samples): sample = val_ds[i] @@ -604,10 +618,10 @@ def generate_sample_audio( gen_audio_np = normalize_audio(gen_audio_np) tag = f"val_sample_{i}" - writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=sample_rate) - log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/sample_rate:.2f}s") + writer.add_audio(f"{tag}/generated_audio", gen_audio_np, global_step=step, sample_rate=gen_sr) + log(f"[Audio] Generated audio for sample {i}: duration={len(gen_audio_np)/gen_sr:.2f}s") - # Log reference audio + # Log reference audio (at encoder input rate, which is what val_ds provides) if ref_audio_np is not None: writer.add_audio( f"{tag}/reference_audio", normalize_audio(ref_audio_np), global_step=step, sample_rate=sample_rate @@ -615,9 +629,9 @@ def generate_sample_audio( # Generate mel spectrogram figure try: - mel_gen = compute_mel_spectrogram(gen_audio_np, sample_rate) + mel_gen = compute_mel_spectrogram(gen_audio_np, gen_sr) mel_ref = compute_mel_spectrogram(ref_audio_np, sample_rate) if ref_audio_np is not None else None - fig = create_mel_figure(gen_audio_np, mel_gen, sample_rate, step, ref_audio_np, mel_ref) + fig = create_mel_figure(gen_audio_np, mel_gen, gen_sr, step, ref_audio_np, mel_ref) writer.add_figure(f"{tag}/mel_spectrogram", fig, global_step=step) log(f"[Audio] Created mel spectrogram figure for sample {i}") except Exception as e: diff --git a/src/voxcpm/modules/locdit/unified_cfm.py b/src/voxcpm/modules/locdit/unified_cfm.py index db7d71d..a16f5e0 100644 --- a/src/voxcpm/modules/locdit/unified_cfm.py +++ b/src/voxcpm/modules/locdit/unified_cfm.py @@ -225,7 +225,7 @@ class UnifiedCFM(torch.nn.Module): losses = F.mse_loss(u_pred, u_tgt.detach(), reduction="none").mean(dim=1) if tgt_mask is not None: weights = self.adaptive_loss_weighting(losses, tgt_mask.squeeze(1)) - loss = (weights * losses).sum() / torch.sum(tgt_mask) + loss = (weights * losses).sum() / torch.clamp(torch.sum(tgt_mask), min=1.0) else: loss = losses.mean()