From c817d6a21d759d2934688d2380acb5b53f00fb61 Mon Sep 17 00:00:00 2001
From: QingyuLiu0521 <2904292256@qq.com>
Date: Sun, 15 Feb 2026 23:24:11 -0500
Subject: [PATCH] Unify seq_len naming in DiT get_input_embed

---
 src/f5_tts/model/backbones/dit.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/f5_tts/model/backbones/dit.py b/src/f5_tts/model/backbones/dit.py
index 5a3e01a..243a0dd 100644
--- a/src/f5_tts/model/backbones/dit.py
+++ b/src/f5_tts/model/backbones/dit.py
@@ -265,10 +265,10 @@ class DiT(nn.Module):
     ):
         if self.text_uncond is None or self.text_cond is None or not cache:
             if audio_mask is None:
-                seq_lens = x.shape[1]
+                seq_len = x.shape[1]
             else:
-                seq_lens = audio_mask.sum(dim=1) # per-sample valid speech length
-            text_embed = self.text_embed(text, seq_lens, drop_text=drop_text)
+                seq_len = audio_mask.sum(dim=1) # per-sample valid speech length
+            text_embed = self.text_embed(text, seq_len=seq_len, drop_text=drop_text)
             if cache:
                 if drop_text:
                     self.text_uncond = text_embed