Mirror of https://github.com/SWivid/F5-TTS.git (synced 2026-01-15 06:23:22 -08:00)

Compare commits: 1dcb4e10f7...1.1.12 (7 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 39617fcf7a | |
| | 5b82f97c26 | |
| | 9ae46c8360 | |
| | 3eecd94baa | |
| | d9a69452ce | |
| | bc15df2b57 | |
| | 9b2357a1b9 | |

@@ -30,6 +30,9 @@
# Create a conda env with python_version>=3.10 (you could also use virtualenv)
conda create -n f5-tts python=3.11
conda activate f5-tts
+
+# Install FFmpeg if you haven't yet
+conda install ffmpeg
```

### Install PyTorch with matched device

@@ -39,7 +42,11 @@ conda activate f5-tts

> ```bash
> # Install pytorch with your CUDA version, e.g.
+> pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128
+>
+> # And also possible previous versions, e.g.
> pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
+> # etc.
> ```

</details>

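Not part of the diff, just a quick sanity check after installation: the wheel's build tag should match the CUDA version chosen above, and the GPU should be visible to that build.

```python
import torch

# Print the build tags of the installed wheel, e.g. "2.8.0+cu128" / "12.8",
# and whether a CUDA device is actually visible to this build.
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
```
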
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "f5-tts"
-version = "1.1.9"
+version = "1.1.12"
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
readme = "README.md"
license = {text = "MIT License"}

@@ -20,15 +20,15 @@ dependencies = [
"click",
"datasets",
"ema_pytorch>=0.5.2",
-"gradio>=5.0.0",
+"gradio>=6.0.0",
"hydra-core>=1.3.0",
-"jieba",
"librosa",
"matplotlib",
"numpy<=1.26.4; python_version<='3.10'",
"pydantic<=2.10.6",
"pydub",
"pypinyin",
+"rjieba",
"safetensors",
"soundfile",
"tomli",

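Since the floor moves from `gradio>=5.0.0` to `gradio>=6.0.0`, an environment created before this bump may still carry an older gradio and hit the API differences shown below. A minimal runtime check, as a sketch (the third-party `packaging` helper used here is assumed to be available; it is not in the dependency list above):

```python
from importlib.metadata import version

from packaging.version import Version

# Warn if the installed gradio predates the new floor in pyproject.toml.
if Version(version("gradio")) < Version("6.0.0"):
    print("gradio < 6.0.0 detected; upgrade with: pip install 'gradio>=6.0.0'")
```
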
@@ -577,7 +577,7 @@ with gr.Blocks() as app_multistyle:
label="Cherry-pick Interface",
lines=10,
max_lines=40,
-show_copy_button=True,
+buttons=["copy"], # show_copy_button=True if gradio<6.0
interactive=False,
visible=False,
)

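Per the inline comment above, `buttons=["copy"]` replaces `show_copy_button=True` from gradio 6.0 on. A hedged sketch of how code that still supports both major versions could select the keyword at runtime (this gating helper is illustrative, not part of the repo):

```python
import gradio as gr

# Choose the copy-button keyword based on the installed gradio major version,
# per the comment in the diff: buttons=["copy"] on >=6.0, show_copy_button=True below.
gradio_major = int(gr.__version__.split(".")[0])
copy_kwargs = {"buttons": ["copy"]} if gradio_major >= 6 else {"show_copy_button": True}

textbox = gr.Textbox(
    label="Cherry-pick Interface",
    lines=10,
    max_lines=40,
    interactive=False,
    visible=False,
    **copy_kwargs,
)
```
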
@@ -816,7 +816,9 @@ Have a conversation with an AI using your reference voice!
lines=2,
)

-chatbot_interface = gr.Chatbot(label="Conversation", type="messages")
+chatbot_interface = gr.Chatbot(
+label="Conversation"
+) # type="messages" hard-coded and no need to pass in since gradio 6.0

with gr.Row():
with gr.Column():

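From gradio 6.0 the Chatbot always uses the "messages" history format, so `type="messages"` no longer needs to be passed. The handlers below operate on that format: a list of role/content dicts, where assistant content may arrive wrapped as a one-element list of text parts. An illustrative sample of such a history (values invented for the example):

```python
# Illustrative conversation state in the "messages" format used by the Chatbot
# and by the handlers below; the wrapped assistant content mirrors what the
# normalization loop in generate_text_response unwraps.
conv_state = [
    {"role": "user", "content": "Hi! Please introduce yourself briefly."},
    {"role": "assistant", "content": [{"type": "text", "text": "Hello, I am your AI assistant."}]},
]
```
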
@@ -853,6 +855,10 @@ Have a conversation with an AI using your reference voice!
@gpu_decorator
def generate_text_response(conv_state, system_prompt):
"""Generate text response from AI"""
+for single_state in conv_state:
+if isinstance(single_state["content"], list):
+assert len(single_state["content"]) == 1 and single_state["content"][0]["type"] == "text"
+single_state["content"] = single_state["content"][0]["text"]

system_prompt_state = [{"role": "system", "content": system_prompt}]
response = chat_model_inference(system_prompt_state + conv_state, chat_model_state, chat_tokenizer_state)

@@ -866,7 +872,7 @@ Have a conversation with an AI using your reference voice!
if not conv_state or not ref_audio:
return None, ref_text, seed_input

-last_ai_response = conv_state[-1]["content"]
+last_ai_response = conv_state[-1]["content"][0]["text"]
if not last_ai_response or conv_state[-1]["role"] != "assistant":
return None, ref_text, seed_input

@@ -1108,7 +1114,6 @@ def main(port, host, share, api, root_path, inbrowser):
server_name=host,
server_port=port,
share=share,
-show_api=api,
root_path=root_path,
inbrowser=inbrowser,
)

@@ -43,7 +43,7 @@ class TextEmbedding(nn.Module):

if conv_layers > 0:
self.extra_modeling = True
-self.precompute_max_pos = 4096 # ~44s of 24khz audio
+self.precompute_max_pos = 8192 # 8192 is ~87.38s of 24khz audio; 4096 is ~43.69s of 24khz audio
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
self.text_blocks = nn.Sequential(
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]

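The durations in the new comment follow from frames × hop length ÷ sample rate; the sketch below assumes a mel hop length of 256 samples at 24 kHz, which is not stated in this diff:

```python
# Rough check of the comment above: seconds = frames * hop_length / sample_rate.
hop_length, sample_rate = 256, 24000  # hop length assumed, not given in the diff
for max_pos in (4096, 8192):
    print(max_pos, round(max_pos * hop_length / sample_rate, 2))  # ~43.69 s and ~87.38 s
```
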
@@ -51,33 +51,29 @@ class TextEmbedding(nn.Module):
else:
self.extra_modeling = False

-def average_upsample_text_by_mask(self, text, text_mask, audio_mask):
+def average_upsample_text_by_mask(self, text, text_mask):
batch, text_len, text_dim = text.shape

-if audio_mask is None:
-audio_mask = torch.ones_like(text_mask, dtype=torch.bool)
-valid_mask = audio_mask & text_mask
-audio_lens = audio_mask.sum(dim=1) # [batch]
-valid_lens = valid_mask.sum(dim=1) # [batch]
+audio_len = text_len # cuz text already padded to same length as audio sequence
+text_lens = text_mask.sum(dim=1) # [batch]

upsampled_text = torch.zeros_like(text)

for i in range(batch):
-audio_len = audio_lens[i].item()
-valid_len = valid_lens[i].item()
+text_len = text_lens[i].item()

-if valid_len == 0:
+if text_len == 0:
continue

-valid_ind = torch.where(valid_mask[i])[0]
-valid_data = text[i, valid_ind, :] # [valid_len, text_dim]
+valid_ind = torch.where(text_mask[i])[0]
+valid_data = text[i, valid_ind, :] # [text_len, text_dim]

-base_repeat = audio_len // valid_len
-remainder = audio_len % valid_len
+base_repeat = audio_len // text_len
+remainder = audio_len % text_len

indices = []
-for j in range(valid_len):
-repeat_count = base_repeat + (1 if j >= valid_len - remainder else 0)
+for j in range(text_len):
+repeat_count = base_repeat + (1 if j >= text_len - remainder else 0)
indices.extend([j] * repeat_count)

indices = torch.tensor(indices[:audio_len], device=text.device, dtype=torch.long)

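The simplified `average_upsample_text_by_mask` spreads the `text_len` valid text positions over the padded audio length by repeating indices, giving the last `remainder` positions one extra repeat. A standalone toy version of just that index pattern (hypothetical helper, not repo code):

```python
import torch

def repeat_indices(text_len: int, audio_len: int) -> torch.Tensor:
    """Indices that stretch text_len positions over audio_len frames, as above."""
    base_repeat, remainder = audio_len // text_len, audio_len % text_len
    indices = []
    for j in range(text_len):
        indices.extend([j] * (base_repeat + (1 if j >= text_len - remainder else 0)))
    return torch.tensor(indices[:audio_len], dtype=torch.long)

print(repeat_indices(3, 8))  # tensor([0, 0, 1, 1, 1, 2, 2, 2])
```
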
@@ -87,7 +83,7 @@ class TextEmbedding(nn.Module):

return upsampled_text

-def forward(self, text: int["b nt"], seq_len, drop_text=False, audio_mask: bool["b n"] | None = None):
+def forward(self, text: int["b nt"], seq_len, drop_text=False):
text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
text = F.pad(text, (0, seq_len - text.shape[1]), value=0) # (opt.) if not self.average_upsampling:

@@ -114,7 +110,7 @@ class TextEmbedding(nn.Module):
text = self.text_blocks(text)

if self.average_upsampling:
-text = self.average_upsample_text_by_mask(text, ~text_mask, audio_mask)
+text = self.average_upsample_text_by_mask(text, ~text_mask)

return text

@@ -247,17 +243,16 @@ class DiT(nn.Module):
):
if self.text_uncond is None or self.text_cond is None or not cache:
if audio_mask is None:
-text_embed = self.text_embed(text, x.shape[1], drop_text=drop_text, audio_mask=audio_mask)
+text_embed = self.text_embed(text, x.shape[1], drop_text=drop_text)
else:
batch = x.shape[0]
-seq_lens = audio_mask.sum(dim=1)
+seq_lens = audio_mask.sum(dim=1) # Calculate the actual sequence length for each sample
text_embed_list = []
for i in range(batch):
text_embed_i = self.text_embed(
text[i].unsqueeze(0),
-seq_lens[i].item(),
+seq_len=seq_lens[i].item(),
drop_text=drop_text,
-audio_mask=audio_mask,
)
text_embed_list.append(text_embed_i[0])
text_embed = pad_sequence(text_embed_list, batch_first=True, padding_value=0)

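With an `audio_mask` present, each sample is now embedded at its own true length and the per-sample results are padded back into one batch tensor. A minimal illustration of that padding step with toy shapes:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Two per-sample embeddings of different lengths, stacked into [batch, max_len, dim].
text_embed_list = [torch.randn(5, 4), torch.randn(3, 4)]
text_embed = pad_sequence(text_embed_list, batch_first=True, padding_value=0)
print(text_embed.shape)  # torch.Size([2, 5, 4])
```
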
@@ -7,7 +7,7 @@ import random
from collections import defaultdict
from importlib.resources import files

-import jieba
+import rjieba
import torch
from pypinyin import Style, lazy_pinyin
from torch.nn.utils.rnn import pad_sequence

@@ -146,10 +146,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):


def convert_char_to_pinyin(text_list, polyphone=True):
-if jieba.dt.initialized is False:
-jieba.default_logger.setLevel(50) # CRITICAL
-jieba.initialize()
-
final_text_list = []
custom_trans = str.maketrans(
{";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}

@@ -163,7 +159,7 @@ convert_char_to_pinyin(text_list, polyphone=True):
for text in text_list:
char_list = []
text = text.translate(custom_trans)
-for seg in jieba.cut(text):
+for seg in rjieba.cut(text):
seg_byte_len = len(bytes(seg, "UTF-8"))
if seg_byte_len == len(seg): # if pure alphabets and symbols
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":

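The `seg_byte_len == len(seg)` test works because ASCII characters are one byte in UTF-8 while CJK characters are typically three, so only a pure alphabet-and-symbol segment has byte length equal to character count. A tiny standalone check (plain Python, no segmenter required):

```python
# For ASCII-only segments, UTF-8 byte length equals character count; CJK differs.
for seg in ("hello,", "语音合成"):
    print(repr(seg), len(bytes(seg, "UTF-8")) == len(seg))
# 'hello,' True
# '语音合成' False
```
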
@@ -1,3 +1,3 @@
FROM nvcr.io/nvidia/tritonserver:24.12-py3
-RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
+RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 rjieba pypinyin librosa vocos
WORKDIR /workspace

@@ -26,7 +26,7 @@
import json
import os

-import jieba
+import rjieba
import torch
import torchaudio
import triton_python_backend_utils as pb_utils

@@ -66,7 +66,7 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
for text in reference_target_texts_list:
char_list = []
text = text.translate(custom_trans)
-for seg in jieba.cut(text):
+for seg in rjieba.cut(text):
seg_byte_len = len(bytes(seg, "UTF-8"))
if seg_byte_len == len(seg): # if pure alphabets and symbols
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":

@@ -225,5 +225,5 @@ if __name__ == "__main__":
# bad zh asr cnt 230435 (samples)
# bad eh asr cnt 37217 (samples)

-# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
# please be careful if using pretrained model, make sure the vocab.txt is same

@@ -122,5 +122,5 @@ if __name__ == "__main__":
# - - 1459 (polyphone)
# char vocab size 5264 5219 5042

-# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
# please be careful if using pretrained model, make sure the vocab.txt is same

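The caution about vocab drift exists because both the word segmenter and pypinyin's polyphone handling decide which pinyin tokens end up in vocab.txt. A small pypinyin probe (`lazy_pinyin` is the real API imported above; exact outputs depend on pypinyin's phrase data and version):

```python
from pypinyin import Style, lazy_pinyin

# The polyphonic character 行 reads "hang2" in 银行 (bank) but "xing2" in 行走 (to walk);
# which reading pypinyin emits, and thus which tokens reach the vocab, depends on its data.
print(lazy_pinyin("银行", style=Style.TONE3))
print(lazy_pinyin("行走", style=Style.TONE3))
```
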