| import json |
| import torchaudio |
| from tqdm import tqdm |
| import os |
| import sys |
| from collections import defaultdict |
|
|
# Sample rates (Hz) treated as normal; any other rate is reported as abnormal.
_EXPECTED_SAMPLE_RATES = frozenset({8000, 16000, 22050, 44100, 48000})


def _check_audio_file(audio_path):
    """Classify a single audio path listed in a JSONL record.

    Args:
        audio_path: One entry of a record's ``audios`` value.

    Returns:
        A ``(category, message)`` tuple. ``category`` is the ``stats`` key to
        increment (``'valid'`` when every check passes). ``message`` is the
        error-log text without the line-number prefix, or ``None`` when the
        file is valid.
    """
    # A non-string entry cannot name a file; passing it to os.path.exists()
    # could raise TypeError or be interpreted as a file descriptor.
    if not isinstance(audio_path, str) or not os.path.exists(audio_path):
        return 'missing', f"缺失文件: {audio_path}"

    if os.path.getsize(audio_path) == 0:
        return 'zero_size', f"空文件: {audio_path}"

    # The broad catch is deliberate: any failure while loading or inspecting
    # the audio is reported as a corrupted file instead of aborting the scan.
    try:
        waveform, sr = torchaudio.load(audio_path)
        if waveform.numel() == 0:
            return 'empty_audio', f"空音频: {audio_path}"
        if sr not in _EXPECTED_SAMPLE_RATES:
            return 'abnormal_sr', f"异常采样率({sr}Hz): {audio_path}"
        return 'valid', None
    except Exception as e:
        # Keep only the text before the first '(' of the exception message.
        error_type = str(e).split('(')[0]
        return 'corrupted', f"损坏文件({error_type}): {audio_path}"


def validate_jsonl_audios(jsonl_path):
    """Validate every audio file referenced by a JSONL file.

    Each line is parsed as JSON and every entry of its ``audios`` value is
    checked for existence, non-zero size, loadability via ``torchaudio.load``,
    non-empty waveform and an expected sample rate. A summary report is
    printed, and when problems were found they are written to
    ``<jsonl_path without extension>_audio_errors.log``.

    Args:
        jsonl_path: Path of the JSONL file to scan. It is read as UTF-8.

    Returns:
        None. Results are printed and, if needed, written to the log file.
    """
    stats = defaultdict(int)
    error_log = []
    valid_samples = 0

    # First pass: count lines so the progress bar can show a total.
    # UTF-8 is explicit so behaviour does not depend on the platform locale.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    # Second pass: validate each record.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        progress = tqdm(f, total=total_lines, desc="验证进度", unit="line")
        for line_num, line in enumerate(progress, start=1):
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                stats['invalid_json'] += 1
                error_log.append(f"[行{line_num}] 无效JSON格式")
                continue

            # Valid JSON that is not an object (e.g. a number or null) has no
            # 'audios' field; treat it as such instead of raising TypeError.
            audios = data.get('audios') if isinstance(data, dict) else None
            if isinstance(audios, str):
                # A bare string is one path, not a sequence of characters.
                audios = [audios]
            if not isinstance(audios, (list, tuple)) or not audios:
                stats['no_audio_field'] += 1
                continue

            sample_ok = True
            for audio_path in audios:
                category, message = _check_audio_file(audio_path)
                stats[category] += 1
                if message is not None:
                    sample_ok = False
                    error_log.append(f"[行{line_num}] {message}")

            # A sample counts as valid only when all of its audio files passed;
            # previously samples with broken audio were counted as valid too.
            if sample_ok:
                valid_samples += 1

    # Summary report.
    print("\n===== 验证报告 =====")
    print(f"总行数: {total_lines}")
    print(f"有效样本: {valid_samples}")
    print("--- 问题统计 ---")
    for k, v in sorted(stats.items()):
        print(f"{k}: {v}")

    # Persist the detailed problems next to the input file. The log always
    # contains non-ASCII text, so the encoding is set explicitly.
    if error_log:
        log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(error_log))
        print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")
|
|
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the JSONL file to check.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("使用方法: python validate_audio_jsonl.py <input.jsonl>")
        sys.exit(1)

    input_path = cli_args[0]
    # Stop early with a readable message when the path does not exist.
    if not os.path.exists(input_path):
        print(f"错误: 文件 {input_path} 不存在")
        sys.exit(1)

    validate_jsonl_audios(input_path)