update: better save last & per ckpt logic #924

Co-authored-by: Yushen CHEN <45333109+SWivid@users.noreply.github.com>
Author: Zhikang Niu
Date: 2025-03-28 13:53:12 +08:00 (committed by GitHub)
parent 7236536f9a
commit 5bd8cd7aed


@@ -395,6 +395,9 @@ class Trainer:
                         self.writer.add_scalar("loss", loss.item(), global_update)
                         self.writer.add_scalar("lr", self.scheduler.get_last_lr()[0], global_update)
 
+                if global_update % self.last_per_updates == 0 and self.accelerator.sync_gradients:
+                    self.save_checkpoint(global_update, last=True)
+
                 if global_update % self.save_per_updates == 0 and self.accelerator.sync_gradients:
                     self.save_checkpoint(global_update)
 
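The two call forms above differ only in the last flag. Below is a minimal sketch of how such a method could be shaped, using accelerate calls the diff already relies on; the attribute self.checkpoint_path and the file names model_last.pt / model_{update}.pt are illustrative assumptions, not confirmed f5_tts internals.

import os

def save_checkpoint(self, update, last=False):
    # Hypothetical sketch, not the actual f5_tts Trainer.save_checkpoint.
    self.accelerator.wait_for_everyone()
    if self.accelerator.is_main_process:
        checkpoint = dict(
            model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
            optimizer_state_dict=self.optimizer.state_dict(),
            scheduler_state_dict=self.scheduler.state_dict(),
            update=update,
        )
        os.makedirs(self.checkpoint_path, exist_ok=True)
        if last:
            # Rolling "last" checkpoint: fixed filename, overwritten each time.
            self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
        else:
            # Per-interval snapshot: numbered by update count, kept around.
            self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{update}.pt")

With that shape, the hunk above makes the rolling save and the numbered snapshot independent checks, each gated on self.accelerator.sync_gradients so a save only fires on a real optimizer step rather than mid-accumulation.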
@@ -430,9 +433,6 @@ class Trainer:
                         )
                         self.model.train()
 
-                if global_update % self.last_per_updates == 0 and self.accelerator.sync_gradients:
-                    self.save_checkpoint(global_update, last=True)
-
         self.save_checkpoint(global_update, last=True)
 
         self.accelerator.end_training()
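Taken together, the two hunks move the rolling last-checkpoint save from the tail of the update loop (after the sample-logging block whose closing lines appear in the second hunk) up to just before the per-update snapshot check, while keeping the unconditional save_checkpoint(global_update, last=True) after the loop so the final state is still written before accelerator.end_training(). Presumably the point is that the last checkpoint now gets refreshed even when a run is interrupted during the heavier snapshot or sample-generation work.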