Replace jieba pkg with rjieba - a jieba-rs Python binding

2025-12-05 20:40:12 -08:00 · 2025-11-28 13:08:07 +00:00
parent 3eecd94baa
commit 9ae46c8360
6 changed files with 9 additions and 13 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "f5-tts"
-version = "1.1.9"
+version = "1.1.10"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -22,13 +22,13 @@ dependencies = [
    "ema_pytorch>=0.5.2",
    "gradio>=5.0.0",
    "hydra-core>=1.3.0",
-    "jieba",
    "librosa",
    "matplotlib",
    "numpy<=1.26.4; python_version<='3.10'",
    "pydantic<=2.10.6",
    "pydub",
    "pypinyin",
+    "rjieba",
    "safetensors",
    "soundfile",
    "tomli",
--- a/src/f5_tts/model/utils.py
+++ b/src/f5_tts/model/utils.py
@@ -7,7 +7,7 @@ import random
 from collections import defaultdict
 from importlib.resources import files

-import jieba
+import rjieba
 import torch
 from pypinyin import Style, lazy_pinyin
 from torch.nn.utils.rnn import pad_sequence
@@ -146,10 +146,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):


 def convert_char_to_pinyin(text_list, polyphone=True):
-    if jieba.dt.initialized is False:
-        jieba.default_logger.setLevel(50)  # CRITICAL
-        jieba.initialize()
-
    final_text_list = []
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
@@ -163,7 +159,7 @@ def convert_char_to_pinyin(text_list, polyphone=True):
    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
-        for seg in jieba.cut(text):
+        for seg in rjieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
--- a/src/f5_tts/runtime/triton_trtllm/Dockerfile.server
+++ b/src/f5_tts/runtime/triton_trtllm/Dockerfile.server
@@ -1,3 +1,3 @@
 FROM nvcr.io/nvidia/tritonserver:24.12-py3
-RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
+RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 rjieba pypinyin librosa vocos
 WORKDIR /workspace
--- a/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
+++ b/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
@@ -26,7 +26,7 @@
 import json
 import os

-import jieba
+import rjieba
 import torch
 import torchaudio
 import triton_python_backend_utils as pb_utils
@@ -66,7 +66,7 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
    for text in reference_target_texts_list:
        char_list = []
        text = text.translate(custom_trans)
-        for seg in jieba.cut(text):
+        for seg in rjieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
--- a/src/f5_tts/train/datasets/prepare_emilia.py
+++ b/src/f5_tts/train/datasets/prepare_emilia.py
@@ -225,5 +225,5 @@ if __name__ == "__main__":
    # bad zh asr cnt        230435   (samples)
    # bad eh asr cnt         37217   (samples)

-    # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+    # vocab size may be slightly different due to the rjieba tokenizer and pypinyin (e.g. how polyphones are handled)
    # please be careful if using pretrained model, make sure the vocab.txt is same
--- a/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
+++ b/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
@@ -122,5 +122,5 @@ if __name__ == "__main__":
    #                           -            -        1459   (polyphone)
    # char   vocab size      5264         5219        5042

-    # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+    # vocab size may be slightly different due to the rjieba tokenizer and pypinyin (e.g. how polyphones are handled)
    # please be careful if using pretrained model, make sure the vocab.txt is same