From 0ee258c0251fc6a314aecbf667a61def2fa768de Mon Sep 17 00:00:00 2001
From: ZhikangNiu
Date: Sat, 23 Nov 2024 23:51:31 +0800
Subject: [PATCH] support hydra config training

---
 src/f5_tts/configs/E2TTS_Base_train.yaml |  40 +++++++++
 src/f5_tts/configs/F5TTS_Base_train.yaml |  42 ++++++++++
 src/f5_tts/train/README.md               |   2 +-
 src/f5_tts/train/train.py                | 104 ++++++++----------------
 4 files changed, 118 insertions(+), 70 deletions(-)
 create mode 100644 src/f5_tts/configs/E2TTS_Base_train.yaml
 create mode 100644 src/f5_tts/configs/F5TTS_Base_train.yaml

diff --git a/src/f5_tts/configs/E2TTS_Base_train.yaml b/src/f5_tts/configs/E2TTS_Base_train.yaml
new file mode 100644
index 0000000..4672bb8
--- /dev/null
+++ b/src/f5_tts/configs/E2TTS_Base_train.yaml
@@ -0,0 +1,40 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: E2TTS
+  tokenizer: char
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
\ No newline at end of file
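Aside (not part of the patch): this config, like the F5TTS one below, names its run/checkpoint directory through OmegaConf interpolation. `${model.name}` and friends are resolved from the config itself, while `${now:...}` is a resolver that Hydra registers at runtime. A minimal sketch of how the `save_dir` string resolves, with an equivalent `now` resolver stubbed in so it runs outside of a Hydra launch:

```python
from datetime import datetime

from omegaconf import OmegaConf

# Hydra provides a "now" resolver during a run; register an equivalent one here
# so the interpolation can be resolved in a plain script.
OmegaConf.register_new_resolver("now", lambda pattern: datetime.now().strftime(pattern))

cfg = OmegaConf.create(
    {
        "datasets": {"name": "Emilia_ZH_EN"},
        "model": {"name": "E2TTS", "tokenizer": "char", "mel_spec": {"mel_spec_type": "vocos"}},
        "ckpts": {
            "save_dir": "ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}"
        },
    }
)

# Interpolations are resolved on access.
print(cfg.ckpts.save_dir)  # e.g. ckpts/E2TTS_vocos_char_Emilia_ZH_EN/2024-11-23/23-51-31
```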
diff --git a/src/f5_tts/configs/F5TTS_Base_train.yaml b/src/f5_tts/configs/F5TTS_Base_train.yaml
new file mode 100644
index 0000000..f3ead3e
--- /dev/null
+++ b/src/f5_tts/configs/F5TTS_Base_train.yaml
@@ -0,0 +1,42 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: F5TTS
+  tokenizer: char
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
\ No newline at end of file
diff --git a/src/f5_tts/train/README.md b/src/f5_tts/train/README.md
index d114db5..a6dfda3 100644
--- a/src/f5_tts/train/README.md
+++ b/src/f5_tts/train/README.md
@@ -35,7 +35,7 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-accelerate launch src/f5_tts/train/train.py
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml  # F5TTS_Base_train.yaml | E2TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
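Aside (not part of the patch): since `train.py` now reads all settings from a Hydra config, individual values can be overridden at launch time with Hydra's `key=value` syntax instead of editing the YAML. A sketch using Hydra's compose API to inspect the merged config; the override values are arbitrary examples, and the relative `config_path` assumes a helper script placed at the repository root:

```python
from hydra import compose, initialize

# config_path is resolved relative to the calling file; this assumes the caller
# sits at the repository root, next to src/.
with initialize(config_path="src/f5_tts/configs"):
    cfg = compose(
        config_name="F5TTS_Base_train",
        overrides=["optim.epochs=1", "datasets.batch_size_per_gpu=3200"],  # arbitrary example overrides
    )

print(cfg.model.name, cfg.optim.epochs, cfg.datasets.batch_size_per_gpu)
```

The same `key=value` overrides can be appended to the `accelerate launch` command above, since Hydra parses the remaining command-line arguments passed to the script.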
diff --git a/src/f5_tts/train/train.py b/src/f5_tts/train/train.py
index fac0fe5..48341f2 100644
--- a/src/f5_tts/train/train.py
+++ b/src/f5_tts/train/train.py
@@ -1,98 +1,64 @@
 # training script.
-
+import os
 from importlib.resources import files
 
+import hydra
+
 from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-# -------------------------- Dataset Settings --------------------------- #
-target_sample_rate = 24000
-n_mel_channels = 100
-hop_length = 256
-win_length = 1024
-n_fft = 1024
-mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
+@hydra.main(config_path=os.path.join("..", "configs"), config_name=None)
+def main(cfg):
+    tokenizer = cfg.model.tokenizer
+    mel_spec_type = cfg.model.mel_spec.mel_spec_type
+    exp_name = f"{cfg.model.name}_{mel_spec_type}_{cfg.model.tokenizer}_{cfg.datasets.name}"
 
-tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
-tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
-dataset_name = "Emilia_ZH_EN"
-
-# -------------------------- Training Settings -------------------------- #
-
-exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
-
-learning_rate = 7.5e-5
-
-batch_size_per_gpu = 38400  # 8 GPUs, 8 * 38400 = 307200
-batch_size_type = "frame"  # "frame" or "sample"
-max_samples = 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-grad_accumulation_steps = 1  # note: updates = steps / grad_accumulation_steps
-max_grad_norm = 1.0
-
-epochs = 11  # use linear decay, thus epochs control the slope
-num_warmup_updates = 20000  # warmup steps
-save_per_updates = 50000  # save checkpoint per steps
-last_per_steps = 5000  # save last checkpoint per steps
-
-# model params
-if exp_name == "F5TTS_Base":
-    wandb_resume_id = None
-    model_cls = DiT
-    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-elif exp_name == "E2TTS_Base":
-    wandb_resume_id = None
-    model_cls = UNetT
-    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
-
-
-# ----------------------------------------------------------------------- #
-
-
-def main():
-    if tokenizer == "custom":
-        tokenizer_path = tokenizer_path
+    # set text tokenizer
+    if tokenizer != "custom":
+        tokenizer_path = cfg.datasets.name
     else:
-        tokenizer_path = dataset_name
+        tokenizer_path = cfg.model.tokenizer_path
     vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
 
-    mel_spec_kwargs = dict(
-        n_fft=n_fft,
-        hop_length=hop_length,
-        win_length=win_length,
-        n_mel_channels=n_mel_channels,
-        target_sample_rate=target_sample_rate,
-        mel_spec_type=mel_spec_type,
-    )
+    # set model
+    if "F5TTS" in cfg.model.name:
+        model_cls = DiT
+    elif "E2TTS" in cfg.model.name:
+        model_cls = UNetT
+    wandb_resume_id = None
 
     model = CFM(
-        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
-        mel_spec_kwargs=mel_spec_kwargs,
+        transformer=model_cls(**cfg.model.arch, text_num_embeds=vocab_size, mel_dim=cfg.model.mel_spec.n_mel_channels),
+        mel_spec_kwargs=cfg.model.mel_spec,
         vocab_char_map=vocab_char_map,
     )
 
+    # init trainer
     trainer = Trainer(
         model,
-        epochs,
-        learning_rate,
-        num_warmup_updates=num_warmup_updates,
-        save_per_updates=save_per_updates,
-        checkpoint_path=str(files("f5_tts").joinpath(f"../../ckpts/{exp_name}")),
-        batch_size=batch_size_per_gpu,
-        batch_size_type=batch_size_type,
-        max_samples=max_samples,
-        grad_accumulation_steps=grad_accumulation_steps,
-        max_grad_norm=max_grad_norm,
+        epochs=cfg.optim.epochs,
+        learning_rate=cfg.optim.learning_rate,
+        num_warmup_updates=cfg.optim.num_warmup_updates,
+        save_per_updates=cfg.ckpts.save_per_updates,
+        checkpoint_path=str(files("f5_tts").joinpath(f"../../{cfg.ckpts.save_dir}")),
+        batch_size=cfg.datasets.batch_size_per_gpu,
+        batch_size_type=cfg.datasets.batch_size_type,
+        max_samples=cfg.datasets.max_samples,
+        grad_accumulation_steps=cfg.optim.grad_accumulation_steps,
+        max_grad_norm=cfg.optim.max_grad_norm,
         wandb_project="CFM-TTS",
         wandb_run_name=exp_name,
        wandb_resume_id=wandb_resume_id,
-        last_per_steps=last_per_steps,
+        last_per_steps=cfg.ckpts.last_per_steps,
         log_samples=True,
         mel_spec_type=mel_spec_type,
+        is_local_vocoder=cfg.model.mel_spec.is_local_vocoder,
+        local_vocoder_path=cfg.model.mel_spec.local_vocoder_path,
     )
 
-    train_dataset = load_dataset(dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)
+    train_dataset = load_dataset(cfg.datasets.name, tokenizer, mel_spec_kwargs=cfg.model.mel_spec)
     trainer.train(
         train_dataset,
         resumable_with_seed=666,  # seed for shuffling dataset
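Aside (not part of the patch): the refactor passes whole config sections straight into constructors, e.g. `model_cls(**cfg.model.arch, ...)` and `mel_spec_kwargs=cfg.model.mel_spec`. This works because an OmegaConf `DictConfig` behaves like a mapping and can be unpacked into keyword arguments. A small illustrative sketch with a stand-in builder function (the extra argument values are arbitrary examples):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({"model": {"arch": {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2}}})


def build_transformer(dim, depth, heads, ff_mult, **extra_kwargs):
    # stand-in for DiT/UNetT construction; extra_kwargs would carry text_num_embeds, mel_dim, ...
    return {"dim": dim, "depth": depth, "heads": heads, "ff_mult": ff_mult, **extra_kwargs}


# A DictConfig section unpacks like a dict, so the YAML arch block maps 1:1 onto keyword args.
print(build_transformer(**cfg.model.arch, text_num_embeds=256, mel_dim=100))
```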