From b4abb3cbd6dc4cd27f4cc1db7f7df5addcc654ab Mon Sep 17 00:00:00 2001 From: SWivid Date: Thu, 24 Oct 2024 13:51:06 +0800 Subject: [PATCH] update infer_gradio --- src/f5_tts/infer/infer_gradio.py | 111 ++++++++----------------------- 1 file changed, 27 insertions(+), 84 deletions(-) diff --git a/src/f5_tts/infer/infer_gradio.py b/src/f5_tts/infer/infer_gradio.py index c7fd443..130e8fc 100644 --- a/src/f5_tts/infer/infer_gradio.py +++ b/src/f5_tts/infer/infer_gradio.py @@ -140,31 +140,6 @@ def generate_podcast( return podcast_path -def parse_speechtypes_text(gen_text): - # Pattern to find (Emotion) - pattern = r"\((.*?)\)" - - # Split the text by the pattern - tokens = re.split(pattern, gen_text) - - segments = [] - - current_emotion = "Regular" - - for i in range(len(tokens)): - if i % 2 == 0: - # This is text - text = tokens[i].strip() - if text: - segments.append({"emotion": current_emotion, "text": text}) - else: - # This is emotion - emotion = tokens[i].strip() - current_emotion = emotion - - return segments - - with gr.Blocks() as app_credits: gr.Markdown(""" # Credits @@ -272,9 +247,9 @@ with gr.Blocks() as app_podcast: ) -def parse_emotional_text(gen_text): - # Pattern to find (Emotion) - pattern = r"\((.*?)\)" +def parse_speechtypes_text(gen_text): + # Pattern to find {speechtype} + pattern = r"\{(.*?)\}" # Split the text by the pattern tokens = re.split(pattern, gen_text) @@ -307,7 +282,7 @@ with gr.Blocks() as app_emotional: **Example Input:** - (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?! + {Regular} Hello, I'd like to order a sandwich please. {Surprised} What do you mean you're out of bread? {Sad} I really wanted a sandwich though... {Angry} You know what, darn you and your little shop, you suck! {Whisper} I'll just go back home and cry now. {Shouting} Why me?! """ ) @@ -323,17 +298,19 @@ with gr.Blocks() as app_emotional: # Additional speech types (up to 99 more) max_speech_types = 100 + speech_type_rows = [] speech_type_names = [] speech_type_audios = [] speech_type_ref_texts = [] speech_type_delete_btns = [] for i in range(max_speech_types - 1): - with gr.Row(): - name_input = gr.Textbox(label="Speech Type Name", visible=False) - audio_input = gr.Audio(label="Reference Audio", type="filepath", visible=False) - ref_text_input = gr.Textbox(label="Reference Text", lines=2, visible=False) - delete_btn = gr.Button("Delete", variant="secondary", visible=False) + with gr.Row(visible=False) as row: + name_input = gr.Textbox(label="Speech Type Name") + audio_input = gr.Audio(label="Reference Audio", type="filepath") + ref_text_input = gr.Textbox(label="Reference Text", lines=2) + delete_btn = gr.Button("Delete", variant="secondary") + speech_type_rows.append(row) speech_type_names.append(name_input) speech_type_audios.append(audio_input) speech_type_ref_texts.append(ref_text_input) @@ -349,79 +326,44 @@ with gr.Blocks() as app_emotional: def add_speech_type_fn(speech_type_count): if speech_type_count < max_speech_types - 1: speech_type_count += 1 - # Prepare updates for the components - name_updates = [] - audio_updates = [] - ref_text_updates = [] - delete_btn_updates = [] + # Prepare updates for the rows + row_updates = [] for i in range(max_speech_types - 1): if i < speech_type_count: - name_updates.append(gr.update(visible=True)) - audio_updates.append(gr.update(visible=True)) - ref_text_updates.append(gr.update(visible=True)) - delete_btn_updates.append(gr.update(visible=True)) + row_updates.append(gr.update(visible=True)) else: - name_updates.append(gr.update()) - audio_updates.append(gr.update()) - ref_text_updates.append(gr.update()) - delete_btn_updates.append(gr.update()) + row_updates.append(gr.update()) else: # Optionally, show a warning - # gr.Warning("Maximum number of speech types reached.") - name_updates = [gr.update() for _ in range(max_speech_types - 1)] - audio_updates = [gr.update() for _ in range(max_speech_types - 1)] - ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)] - delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)] - return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + row_updates = [gr.update() for _ in range(max_speech_types - 1)] + return [speech_type_count] + row_updates add_speech_type_btn.click( - add_speech_type_fn, - inputs=speech_type_count, - outputs=[speech_type_count] - + speech_type_names - + speech_type_audios - + speech_type_ref_texts - + speech_type_delete_btns, + add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows ) # Function to delete a speech type def make_delete_speech_type_fn(index): def delete_speech_type_fn(speech_type_count): # Prepare updates - name_updates = [] - audio_updates = [] - ref_text_updates = [] - delete_btn_updates = [] + row_updates = [] for i in range(max_speech_types - 1): if i == index: - name_updates.append(gr.update(visible=False, value="")) - audio_updates.append(gr.update(visible=False, value=None)) - ref_text_updates.append(gr.update(visible=False, value="")) - delete_btn_updates.append(gr.update(visible=False)) + row_updates.append(gr.update(visible=False)) else: - name_updates.append(gr.update()) - audio_updates.append(gr.update()) - ref_text_updates.append(gr.update()) - delete_btn_updates.append(gr.update()) + row_updates.append(gr.update()) speech_type_count = max(0, speech_type_count - 1) - return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + return [speech_type_count] + row_updates return delete_speech_type_fn + # Update delete button clicks for i, delete_btn in enumerate(speech_type_delete_btns): delete_fn = make_delete_speech_type_fn(i) - delete_btn.click( - delete_fn, - inputs=speech_type_count, - outputs=[speech_type_count] - + speech_type_names - + speech_type_audios - + speech_type_ref_texts - + speech_type_delete_btns, - ) + delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows) # Text input for the prompt gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10) @@ -432,7 +374,7 @@ with gr.Blocks() as app_emotional: with gr.Accordion("Advanced Settings", open=False): remove_silence_emotional = gr.Checkbox( label="Remove Silences", - value=True, + value=False, ) # Generate button @@ -529,7 +471,7 @@ with gr.Blocks() as app_emotional: speech_types_available.add(name_input) # Parse the gen_text to get the speech types used - segments = parse_emotional_text(gen_text) + segments = parse_speechtypes_text(gen_text) speech_types_in_text = set(segment["emotion"] for segment in segments) # Check if all speech types in text are available @@ -547,6 +489,7 @@ with gr.Blocks() as app_emotional: inputs=[gen_text_input_emotional, regular_name] + speech_type_names, outputs=generate_emotional_btn, ) + with gr.Blocks() as app: gr.Markdown( """