From 3e2a07da1d3fabcb6e8804cec25100238e78c04e Mon Sep 17 00:00:00 2001
From: SWivid
Date: Sun, 11 May 2025 19:40:37 +0800
Subject: [PATCH] Update README.md & minor fixes

---
 src/f5_tts/infer/README.md      | 24 +++++++++++++++++++++++-
 src/f5_tts/infer/infer_cli.py   |  2 +-
 src/f5_tts/infer/utils_infer.py |  2 +-
 src/f5_tts/train/README.md      |  6 ++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/f5_tts/infer/README.md b/src/f5_tts/infer/README.md
index 07bfeb4..9de47aa 100644
--- a/src/f5_tts/infer/README.md
+++ b/src/f5_tts/infer/README.md
@@ -13,7 +13,7 @@ To avoid possible inference failures, make sure you have seen through the follow
 - Add some spaces (blank: " ") or punctuation marks (e.g. "," ".") to explicitly introduce some pauses.
 - If English punctuation marks the end of a sentence, make sure there is a space " " after it. Otherwise it is not regarded as a sentence boundary when chunking.
 - Preprocess numbers into Chinese characters if you want them read in Chinese; otherwise they are read in English.
-- If the generation output is blank (pure silence), check for ffmpeg installation.
+- If the generation output is blank (pure silence), check for FFmpeg installation.
 - Try turning off `use_ema` if using an early-stage finetuned checkpoint (which has gone through only a few updates).
 
 
@@ -129,6 +129,28 @@ ref_text = ""
 ```
 You should mark the voice with `[main]` `[town]` `[country]` whenever you want to change voice; refer to `src/f5_tts/infer/examples/multi/story.txt`.
+## API Usage
+
+```python
+from importlib.resources import files
+from f5_tts.api import F5TTS
+
+f5tts = F5TTS()
+wav, sr, spec = f5tts.infer(
+    ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
+    ref_text="some call me nature, others call me mother nature.",
+    gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
+    file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
+    file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
+    seed=None,
+)
+```
+Check [api.py](../api.py) for more details.
+
+## TensorRT-LLM Deployment
+
+See [detailed instructions](../runtime/triton_trtllm/README.md) for more information.
+
 ## Socket Real-time Service
 
 Real-time voice output with chunk stream:
 
diff --git a/src/f5_tts/infer/infer_cli.py b/src/f5_tts/infer/infer_cli.py
index 673acaa..95800fd 100644
--- a/src/f5_tts/infer/infer_cli.py
+++ b/src/f5_tts/infer/infer_cli.py
@@ -323,7 +323,7 @@ def main():
             ref_text_ = voices[voice]["ref_text"]
             gen_text_ = text.strip()
             print(f"Voice: {voice}")
-            audio_segment, final_sample_rate, spectragram = infer_process(
+            audio_segment, final_sample_rate, spectrogram = infer_process(
                 ref_audio_,
                 ref_text_,
                 gen_text_,
diff --git a/src/f5_tts/infer/utils_infer.py b/src/f5_tts/infer/utils_infer.py
index bb7a9b8..7ef7926 100644
--- a/src/f5_tts/infer/utils_infer.py
+++ b/src/f5_tts/infer/utils_infer.py
@@ -384,7 +384,7 @@ def infer_process(
 ):
     # Split the input text into batches
     audio, sr = torchaudio.load(ref_audio)
-    max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr))
+    max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr) * speed)
     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     for i, gen_text in enumerate(gen_text_batches):
         print(f"gen_text {i}", gen_text)
diff --git a/src/f5_tts/train/README.md b/src/f5_tts/train/README.md
index 537cea6..b66d120 100644
--- a/src/f5_tts/train/README.md
+++ b/src/f5_tts/train/README.md
@@ -1,5 +1,11 @@
 # Training
 
+Check your FFmpeg installation:
+```bash
+ffmpeg -version
+```
+If not found, install it first (or skip this step if you know another audio backend is available).
+
 ## Prepare Dataset
 
 Example data processing scripts are provided below, and you may tailor your own along with a Dataset class in `src/f5_tts/model/dataset.py`.
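
As a quick sanity check of the `max_chars` change in `utils_infer.py` above: the batching budget now scales linearly with `speed`. A minimal sketch of the arithmetic, using illustrative numbers that are not taken from the repo:

```python
# Hypothetical reference clip: 6 seconds long, transcript 120 UTF-8 bytes.
ref_secs = 6.0    # stands in for audio.shape[-1] / sr
ref_bytes = 120   # stands in for len(ref_text.encode("utf-8"))

# The formula estimates the bytes-per-second rate of the reference speech,
# multiplies by the remaining budget under the 22 figure used in
# infer_process, and (with this patch) additionally scales by speed.
for speed in (0.5, 1.0, 1.5):
    max_chars = int(ref_bytes / ref_secs * (22 - ref_secs) * speed)
    print(speed, max_chars)
# 0.5 160
# 1.0 320
# 1.5 480
```

So slower speech (`speed < 1`) now yields shorter text batches, which matches the intent of keeping each generated batch within the same duration budget.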
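The space-after-punctuation note in the infer README checklist can also be probed directly with `chunk_text` (importable from `f5_tts.infer.utils_infer`, the same helper `infer_process` calls above). The exact split behavior shown in the comments is an assumption to verify, not a documented guarantee:

```python
from f5_tts.infer.utils_infer import chunk_text

# With a space after the sentence-final period, chunk_text can split at
# the sentence boundary; without one, the boundary may go unrecognized
# and both sentences may land in a single chunk.
print(chunk_text("First sentence. Second sentence.", max_chars=20))
print(chunk_text("First sentence.Second sentence.", max_chars=20))
```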