From bec4ebcae574febbcbbeec5fe0bafb8042bf58bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hasan=20Can=20Solako=C4=9Flu?= Date: Fri, 7 Feb 2025 22:35:30 +0300 Subject: [PATCH] Enhance CSV preparation script to preserve order of processed audio files in chunk submissions --- src/f5_tts/train/datasets/prepare_csv_wavs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/f5_tts/train/datasets/prepare_csv_wavs.py b/src/f5_tts/train/datasets/prepare_csv_wavs.py index 22c97ef..ba964ea 100644 --- a/src/f5_tts/train/datasets/prepare_csv_wavs.py +++ b/src/f5_tts/train/datasets/prepare_csv_wavs.py @@ -111,14 +111,15 @@ def prepare_csv_wavs_dir(input_dir, num_workers=None): # Process files in chunks for better efficiency for i in range(0, len(audio_path_text_pairs), CHUNK_SIZE): chunk = audio_path_text_pairs[i:i + CHUNK_SIZE] + # Submit futures in order chunk_futures = [ executor.submit(process_audio_file, pair[0], pair[1], polyphone) for pair in chunk ] - # Process chunk results with progress bar + # Iterate over futures in the original submission order to preserve ordering for future in tqdm( - concurrent.futures.as_completed(chunk_futures), + chunk_futures, total=len(chunk), desc=f"Processing chunk {i//CHUNK_SIZE + 1}/{(total_files + CHUNK_SIZE - 1)//CHUNK_SIZE}" ):