diff --git a/pyproject.toml b/pyproject.toml
index 69e4c9c..bbd633e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "tqdm>=4.65.0",
     "transformers",
     "transformers_stream_generator",
+    "unidecode",
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
diff --git a/src/f5_tts/infer/infer_cli.py b/src/f5_tts/infer/infer_cli.py
index 95800fd..5bd20ce 100644
--- a/src/f5_tts/infer/infer_cli.py
+++ b/src/f5_tts/infer/infer_cli.py
@@ -12,6 +12,7 @@ import tomli
 from cached_path import cached_path
 from hydra.utils import get_class
 from omegaconf import OmegaConf
+from unidecode import unidecode
 
 from f5_tts.infer.utils_infer import (
     cfg_strength,
@@ -112,6 +113,11 @@ parser.add_argument(
     action="store_true",
     help="To save each audio chunks during inference",
 )
+parser.add_argument(
+    "--no_legacy_text",
+    action="store_false",
+    help="Not to use lossy ASCII transliterations of unicode text in saved file names.",
+)
 parser.add_argument(
     "--remove_silence",
     action="store_true",
@@ -197,6 +203,12 @@ output_file = args.output_file or config.get(
 )
 
 save_chunk = args.save_chunk or config.get("save_chunk", False)
+use_legacy_text = args.no_legacy_text and not config.get("no_legacy_text", False)  # arg is store_false (default True)
+if save_chunk and use_legacy_text:
+    print(
+        "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
+    )
+
 remove_silence = args.remove_silence or config.get("remove_silence", False)
 load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
 
@@ -344,6 +356,8 @@ def main():
         if save_chunk:
             if len(gen_text_) > 200:
                 gen_text_ = gen_text_[:200] + " ... "
+            if use_legacy_text:
+                gen_text_ = unidecode(gen_text_)
             sf.write(
                 os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
                 audio_segment,