Update. Cache last used custom model path #447

This commit is contained in:
SWivid
2024-11-11 11:13:11 +08:00
parent e6e7bd01d3
commit 59761ec9f8
2 changed files with 57 additions and 27 deletions

View File

@@ -1,21 +1,25 @@
<!-- omit in toc -->
# Shared Model Cards
- This document is serving as a quick lookup table for the community training/finetuning result, with various language support.
- The models in this repository are open source and are based on voluntary contributions from contributors.
- The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
- Welcome to pull request sharing your result here.
- **Prerequisites of using**
- This document serves as a quick lookup table for community training/finetuning results, with various language support.
- The models in this repository are open source and are based on voluntary contributions from contributors.
- The use of these models must respect their respective creators; the convenience provided comes from their efforts.
- **Welcome to share here**
- Have a pretrained/finetuned result: a model checkpoint (preferably pruned to speed up inference, i.e. keep only `ema_model_state_dict`) and the corresponding vocab file (for tokenization).
- Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
- Make a pull request adding a model card to the current page, i.e. `src/f5_tts/infer/SHARED.md`.
<!-- omit in toc -->
### Support Language
- [Multilingual](#multilingual)
- [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
- [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
- [Mandarin](#mandarin)
- [English](#english)
### Multilingual
## Multilingual
#### F5-TTS Base @ pretrain @ zh & en
|Model|🤗Hugging Face|Data (Hours)|Model License|
@@ -26,10 +30,10 @@
MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
```
*Other infos, e.g. Link to some sampled results, Github repo, Usage instruction, Tutorial (Blog, Video, etc.) ...*
*Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
### Mandarin
## Mandarin
### English
## English

View File

@@ -4,6 +4,7 @@
import re
import tempfile
from collections import OrderedDict
from importlib.resources import files
import click
import gradio as gr
@@ -71,6 +72,7 @@ def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
F5TTS_ema_model = load_f5tts()
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
custom_ema_model, pre_custom_path = None, ""
chat_model_state = None
chat_tokenizer_state = None
@@ -115,8 +117,11 @@ def infer(
ema_model = E2TTS_ema_model
elif isinstance(model, list) and model[0] == "Custom":
assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
show_info("Loading Custom TTS model...")
custom_ema_model = load_custom(model[1], vocab_path=model[2])
global custom_ema_model, pre_custom_path
if pre_custom_path != model[1]:
show_info("Loading Custom TTS model...")
custom_ema_model = load_custom(model[1], vocab_path=model[2])
pre_custom_path = model[1]
ema_model = custom_ema_model
final_wave, final_sample_rate, combined_spectrogram = infer_process(
@@ -739,14 +744,29 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
"""
)
last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom.txt")

# Default checkpoint/vocab pair, served whenever no valid cached choice exists.
DEFAULT_CUSTOM_MODEL = [
    "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
    "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
]


def load_last_used_custom():
    """Return the last used custom model as ``[ckpt_path, vocab_path]``.

    Reads the comma-separated cache file written by ``switch_tts_model``.
    Falls back to the default F5-TTS Base paths when the cache file is
    missing, or when its content does not parse into exactly two entries
    (e.g. an empty or hand-edited file), so callers indexing ``[0]``/``[1]``
    never crash.
    """
    try:
        with open(last_used_custom, "r") as f:
            # strip() guards against a trailing newline corrupting the vocab path
            parts = f.read().strip().split(",")
        if len(parts) != 2:
            # Corrupt/empty cache: ignore it rather than propagate bad paths.
            return list(DEFAULT_CUSTOM_MODEL)
        return parts
    except FileNotFoundError:
        # First run: create the cache directory so a later write succeeds.
        last_used_custom.parent.mkdir(parents=True, exist_ok=True)
        return list(DEFAULT_CUSTOM_MODEL)
def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
    """Handle a TTS-model radio change; toggle the custom-path inputs.

    When "Custom" is selected, stores the choice together with both paths in
    the module-level ``tts_model_choice`` and persists the paths to
    ``last_used_custom`` so they are restored on the next launch; otherwise
    stores the plain model name.

    Returns a pair of ``gr.update`` objects controlling the visibility of the
    custom checkpoint and vocab inputs (visible only for "Custom").
    """
    global tts_model_choice
    if new_choice == "Custom":
        tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
        # Cache the last used custom paths so they survive app restarts.
        with open(last_used_custom, "w") as f:
            f.write(f"{custom_ckpt_path},{custom_vocab_path}")
        return gr.update(visible=True), gr.update(visible=True)
    else:
        tts_model_choice = new_choice
        return gr.update(visible=False), gr.update(visible=False)
with gr.Row():
if not USING_SPACES:
@@ -757,32 +777,38 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
choose_tts_model = gr.Radio(
choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
)
with gr.Column(visible=False) as choose_custom_tts_model:
custom_ckpt_path = gr.Textbox(
placeholder="MODEL_CKPT: local_path | hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
show_label=False,
min_width=200,
)
custom_vocab_path = gr.Textbox(
placeholder="VOCAB_FILE: local_path | hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt | leave blank to use default",
show_label=False,
min_width=200,
)
custom_ckpt_path = gr.Dropdown(
choices=["hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"],
value=load_last_used_custom()[0],
allow_custom_value=True,
label="MODEL CKPT: local_path | hf://user_id/repo_id/model_ckpt",
visible=False,
)
custom_vocab_path = gr.Dropdown(
choices=["hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt"],
value=load_last_used_custom()[1],
allow_custom_value=True,
label="VOCAB FILE: local_path | hf://user_id/repo_id/vocab_file",
visible=False,
)
choose_tts_model.change(
switch_tts_model,
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
outputs=[choose_custom_tts_model],
outputs=[custom_ckpt_path, custom_vocab_path],
show_progress="hidden",
)
custom_ckpt_path.change(
switch_tts_model,
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
outputs=[choose_custom_tts_model],
outputs=[custom_ckpt_path, custom_vocab_path],
show_progress="hidden",
)
custom_vocab_path.change(
switch_tts_model,
inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
outputs=[choose_custom_tts_model],
outputs=[custom_ckpt_path, custom_vocab_path],
show_progress="hidden",
)
gr.TabbedInterface(