fix: address four validation correctness issues from review

- Invalid audio rows (bad path or sample-rate mismatch) no longer
  increment valid_samples; has_error is now set on any audio failure
- _check_audio_file now enforces the expected sample rate when soundfile
  is available, making --sample-rate actually useful
- ref_audio missing-file warning is emitted for every invalid entry
  independently, not only before the first valid one is seen
- New tests cover each of the four corrected behaviours: invalid audio
  count, sample-rate mismatch, mixed ref_audio, and CLI exit code
This commit is contained in:
supermario_leo
2026-04-22 05:06:35 +08:00
parent 4457617953
commit 4509becfde
2 changed files with 77 additions and 2 deletions
+12 -2
View File
@@ -31,7 +31,10 @@ class ValidationResult:
def _check_audio_file(audio_path: str, sample_rate: int) -> Optional[str]: def _check_audio_file(audio_path: str, sample_rate: int) -> Optional[str]:
"""Check if an audio file exists and is readable. Returns error message or None.""" """Check if an audio file exists, is readable, and matches expected sample rate.
Returns an error message, or None if the file is valid.
"""
if not os.path.isfile(audio_path): if not os.path.isfile(audio_path):
return f"Audio file not found: {audio_path}" return f"Audio file not found: {audio_path}"
try: try:
@@ -40,6 +43,11 @@ def _check_audio_file(audio_path: str, sample_rate: int) -> Optional[str]:
info = sf.info(audio_path) info = sf.info(audio_path)
if info.frames == 0: if info.frames == 0:
return f"Audio file is empty: {audio_path}" return f"Audio file is empty: {audio_path}"
if info.samplerate != sample_rate:
return (
f"Sample rate mismatch in {audio_path}: "
f"expected {sample_rate} Hz, got {info.samplerate} Hz"
)
return None return None
except ImportError: except ImportError:
# soundfile not available; just check existence # soundfile not available; just check existence
@@ -173,6 +181,7 @@ def validate_manifest(
missing_audio_count += 1 missing_audio_count += 1
if missing_audio_count <= 5: if missing_audio_count <= 5:
result.errors.append(f"Line {i + 1}: {audio_error}") result.errors.append(f"Line {i + 1}: {audio_error}")
has_error = True
else: else:
duration = _get_audio_duration(audio_path) duration = _get_audio_duration(audio_path)
if duration is not None: if duration is not None:
@@ -187,6 +196,7 @@ def validate_manifest(
) )
else: else:
result.errors.append(f"Line {i + 1}: Invalid audio path") result.errors.append(f"Line {i + 1}: Invalid audio path")
has_error = True
# Validate optional ref_audio # Validate optional ref_audio
if "ref_audio" in entry: if "ref_audio" in entry:
@@ -198,7 +208,7 @@ def validate_manifest(
ref_path = str(manifest_dir / ref_path) ref_path = str(manifest_dir / ref_path)
if os.path.isfile(ref_path): if os.path.isfile(ref_path):
result.has_ref_audio += 1 result.has_ref_audio += 1
elif result.has_ref_audio == 0: else:
result.warnings.append( result.warnings.append(
f"Line {i + 1}: ref_audio file not found: {ref_path}" f"Line {i + 1}: ref_audio file not found: {ref_path}"
) )
+65
View File
@@ -183,3 +183,68 @@ class TestValidateManifest:
r3 = ValidationResult(total_samples=0, valid_samples=0) r3 = ValidationResult(total_samples=0, valid_samples=0)
assert not r3.is_valid assert not r3.is_valid
def test_invalid_audio_not_counted_as_valid(self, tmp_dir):
"""A row with a bad audio path must not increment valid_samples."""
manifest = tmp_dir / "bad_audio.jsonl"
_write_manifest(
manifest,
[{"text": "hello", "audio": "/nonexistent/audio.wav"}],
)
result = validate_manifest(str(manifest))
assert result.total_samples == 1
assert result.valid_samples == 0
assert not result.is_valid
assert any("not found" in e for e in result.errors)
def test_sample_rate_mismatch(self, tmp_dir):
"""A file with a different sample rate should be reported as an error."""
try:
import soundfile as sf
import numpy as np
except ImportError:
pytest.skip("soundfile not available")
audio = tmp_dir / "audio_8k.wav"
import numpy as np
samples = np.zeros(8000, dtype=np.float32)
sf.write(str(audio), samples, 8000)
manifest = tmp_dir / "sr_mismatch.jsonl"
_write_manifest(manifest, [{"text": "hello", "audio": str(audio)}])
result = validate_manifest(str(manifest), sample_rate=16000)
assert result.valid_samples == 0
assert not result.is_valid
assert any("Sample rate mismatch" in e or "sample rate" in e.lower() for e in result.errors)
def test_mixed_ref_audio_warns_for_each_missing(self, tmp_dir):
"""Missing ref_audio entries should each generate a warning independently."""
audio = tmp_dir / "audio.wav"
ref_good = tmp_dir / "ref_good.wav"
_create_wav(audio)
_create_wav(ref_good)
manifest = tmp_dir / "mixed_ref.jsonl"
_write_manifest(
manifest,
[
{"text": "row1", "audio": str(audio), "ref_audio": str(ref_good)},
{"text": "row2", "audio": str(audio), "ref_audio": "/nonexistent/ref.wav"},
],
)
result = validate_manifest(str(manifest))
assert result.has_ref_audio == 1
assert any("ref_audio file not found" in w for w in result.warnings)
def test_cli_validate_exit_code(self, tmp_dir):
"""validate subcommand must exit non-zero on error."""
import subprocess
manifest = tmp_dir / "bad.jsonl"
_write_manifest(manifest, [{"text": "hi", "audio": "/nonexistent/x.wav"}])
proc = subprocess.run(
[sys.executable, "-m", "voxcpm.cli", "validate", str(manifest)],
capture_output=True,
)
assert proc.returncode != 0