From b4abb3cbd6dc4cd27f4cc1db7f7df5addcc654ab Mon Sep 17 00:00:00 2001
From: SWivid <swivid@qq.com>
Date: Thu, 24 Oct 2024 13:51:06 +0800
Subject: [PATCH] update infer_gradio

---
 src/f5_tts/infer/infer_gradio.py | 111 ++++++++-----------------------
 1 file changed, 27 insertions(+), 84 deletions(-)

diff --git a/src/f5_tts/infer/infer_gradio.py b/src/f5_tts/infer/infer_gradio.py
index c7fd443..130e8fc 100644
--- a/src/f5_tts/infer/infer_gradio.py
+++ b/src/f5_tts/infer/infer_gradio.py
@@ -140,31 +140,6 @@ def generate_podcast(
     return podcast_path
 
 
-def parse_speechtypes_text(gen_text):
-    # Pattern to find (Emotion)
-    pattern = r"\((.*?)\)"
-
-    # Split the text by the pattern
-    tokens = re.split(pattern, gen_text)
-
-    segments = []
-
-    current_emotion = "Regular"
-
-    for i in range(len(tokens)):
-        if i % 2 == 0:
-            # This is text
-            text = tokens[i].strip()
-            if text:
-                segments.append({"emotion": current_emotion, "text": text})
-        else:
-            # This is emotion
-            emotion = tokens[i].strip()
-            current_emotion = emotion
-
-    return segments
-
-
 with gr.Blocks() as app_credits:
     gr.Markdown("""
 # Credits
@@ -272,9 +247,9 @@ with gr.Blocks() as app_podcast:
     )
 
 
-def parse_emotional_text(gen_text):
-    # Pattern to find (Emotion)
-    pattern = r"\((.*?)\)"
+def parse_speechtypes_text(gen_text):
+    # Pattern to find {speechtype}
+    pattern = r"\{(.*?)\}"
 
     # Split the text by the pattern
     tokens = re.split(pattern, gen_text)
@@ -307,7 +282,7 @@ with gr.Blocks() as app_emotional:
 
     **Example Input:**
 
-    (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
+    {Regular} Hello, I'd like to order a sandwich please. {Surprised} What do you mean you're out of bread? {Sad} I really wanted a sandwich though... {Angry} You know what, darn you and your little shop, you suck! {Whisper} I'll just go back home and cry now. {Shouting} Why me?!
     """
     )
 
@@ -323,17 +298,19 @@ with gr.Blocks() as app_emotional:
 
     # Additional speech types (up to 99 more)
     max_speech_types = 100
+    speech_type_rows = []
     speech_type_names = []
     speech_type_audios = []
     speech_type_ref_texts = []
     speech_type_delete_btns = []
 
     for i in range(max_speech_types - 1):
-        with gr.Row():
-            name_input = gr.Textbox(label="Speech Type Name", visible=False)
-            audio_input = gr.Audio(label="Reference Audio", type="filepath", visible=False)
-            ref_text_input = gr.Textbox(label="Reference Text", lines=2, visible=False)
-            delete_btn = gr.Button("Delete", variant="secondary", visible=False)
+        with gr.Row(visible=False) as row:
+            name_input = gr.Textbox(label="Speech Type Name")
+            audio_input = gr.Audio(label="Reference Audio", type="filepath")
+            ref_text_input = gr.Textbox(label="Reference Text", lines=2)
+            delete_btn = gr.Button("Delete", variant="secondary")
+        speech_type_rows.append(row)
         speech_type_names.append(name_input)
         speech_type_audios.append(audio_input)
         speech_type_ref_texts.append(ref_text_input)
@@ -349,79 +326,44 @@ with gr.Blocks() as app_emotional:
     def add_speech_type_fn(speech_type_count):
         if speech_type_count < max_speech_types - 1:
             speech_type_count += 1
-            # Prepare updates for the components
-            name_updates = []
-            audio_updates = []
-            ref_text_updates = []
-            delete_btn_updates = []
+            # Prepare updates for the rows
+            row_updates = []
             for i in range(max_speech_types - 1):
                 if i < speech_type_count:
-                    name_updates.append(gr.update(visible=True))
-                    audio_updates.append(gr.update(visible=True))
-                    ref_text_updates.append(gr.update(visible=True))
-                    delete_btn_updates.append(gr.update(visible=True))
+                    row_updates.append(gr.update(visible=True))
                 else:
-                    name_updates.append(gr.update())
-                    audio_updates.append(gr.update())
-                    ref_text_updates.append(gr.update())
-                    delete_btn_updates.append(gr.update())
+                    row_updates.append(gr.update())
         else:
             # Optionally, show a warning
-            # gr.Warning("Maximum number of speech types reached.")
-            name_updates = [gr.update() for _ in range(max_speech_types - 1)]
-            audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
-            ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
-            delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
-        return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
+            row_updates = [gr.update() for _ in range(max_speech_types - 1)]
+        return [speech_type_count] + row_updates
 
     add_speech_type_btn.click(
-        add_speech_type_fn,
-        inputs=speech_type_count,
-        outputs=[speech_type_count]
-        + speech_type_names
-        + speech_type_audios
-        + speech_type_ref_texts
-        + speech_type_delete_btns,
+        add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
     )
 
     # Function to delete a speech type
     def make_delete_speech_type_fn(index):
         def delete_speech_type_fn(speech_type_count):
             # Prepare updates
-            name_updates = []
-            audio_updates = []
-            ref_text_updates = []
-            delete_btn_updates = []
+            row_updates = []
 
             for i in range(max_speech_types - 1):
                 if i == index:
-                    name_updates.append(gr.update(visible=False, value=""))
-                    audio_updates.append(gr.update(visible=False, value=None))
-                    ref_text_updates.append(gr.update(visible=False, value=""))
-                    delete_btn_updates.append(gr.update(visible=False))
+                    row_updates.append(gr.update(visible=False))
                 else:
-                    name_updates.append(gr.update())
-                    audio_updates.append(gr.update())
-                    ref_text_updates.append(gr.update())
-                    delete_btn_updates.append(gr.update())
+                    row_updates.append(gr.update())
 
             speech_type_count = max(0, speech_type_count - 1)
 
-            return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
+            return [speech_type_count] + row_updates
 
         return delete_speech_type_fn
 
+    # Wire each delete button to the handler that hides its own row
     for i, delete_btn in enumerate(speech_type_delete_btns):
         delete_fn = make_delete_speech_type_fn(i)
-        delete_btn.click(
-            delete_fn,
-            inputs=speech_type_count,
-            outputs=[speech_type_count]
-            + speech_type_names
-            + speech_type_audios
-            + speech_type_ref_texts
-            + speech_type_delete_btns,
-        )
+        delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
     # Text input for the prompt
     gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
@@ -432,7 +374,7 @@ with gr.Blocks() as app_emotional:
     with gr.Accordion("Advanced Settings", open=False):
         remove_silence_emotional = gr.Checkbox(
             label="Remove Silences",
-            value=True,
+            value=False,
         )
 
     # Generate button
@@ -529,7 +471,7 @@ with gr.Blocks() as app_emotional:
                 speech_types_available.add(name_input)
 
         # Parse the gen_text to get the speech types used
-        segments = parse_emotional_text(gen_text)
+        segments = parse_speechtypes_text(gen_text)
         speech_types_in_text = set(segment["emotion"] for segment in segments)
 
         # Check if all speech types in text are available
@@ -547,6 +489,7 @@ with gr.Blocks() as app_emotional:
         inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
         outputs=generate_emotional_btn,
     )
+
 with gr.Blocks() as app:
     gr.Markdown(
         """