
    2Vh$                     \    d dl Z d dlmZ d dlmZ d dlmZ  ed       G d de             Zy)    N)keras_export)Callback)
file_utilsz keras.callbacks.BackupAndRestorec                   \     e Zd ZdZ	 	 	 d
 fd	ZddZd ZddZddZd Z	d Z
dd	Z xZS )BackupAndRestorea  Callback to back up and restore the training state.

    `BackupAndRestore` callback is intended to recover training from an
    interruption that has happened in the middle of a `Model.fit` execution, by
    backing up the training states in a temporary checkpoint file, at the end of
    each epoch. Each backup overwrites the previously written checkpoint file,
    so at any given time there is at most one such checkpoint file for
    backup/restoring purpose.

    If training restarts before completion, the training state (which includes
    the `Model` weights and epoch number) is restored to the most recently saved
    state at the beginning of a new `Model.fit` run. At the completion of a
    `Model.fit` run, the temporary checkpoint file is deleted.

    Note that the user is responsible to bring jobs back after the interruption.
    This callback is important for the backup and restore mechanism for fault
    tolerance purpose, and the model to be restored from a previous checkpoint
    is expected to be the same as the one used to back up. If user changes
    arguments passed to compile or fit, the checkpoint saved for fault tolerance
    can become invalid.

    Example:

    >>> class InterruptingCallback(keras.callbacks.Callback):
    ...   def on_epoch_begin(self, epoch, logs=None):
    ...     if epoch == 4:
    ...       raise RuntimeError('Interrupting!')
    >>> callback = keras.callbacks.BackupAndRestore(backup_dir="/tmp/backup")
    >>> model = keras.models.Sequential([keras.layers.Dense(10)])
    >>> model.compile(keras.optimizers.SGD(), loss='mse')
    >>> model.build(input_shape=(None, 20))
    >>> try:
    ...   model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10,
    ...             batch_size=1, callbacks=[callback, InterruptingCallback()],
    ...             verbose=0)
    ... except:
    ...   pass
    >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5),
    ...                     epochs=10, batch_size=1, callbacks=[callback],
    ...                     verbose=0)
    >>> # Only 6 more epochs are run, since first training got interrupted at
    >>> # zero-indexed epoch 4, second training will continue from 4 to 9.
    >>> len(history.history['loss'])
    >>> 6

    Args:
        backup_dir: String, path of directory where to store the data
            needed to restore the model. The directory
            cannot be reused elsewhere to store other files, e.g. by the
            `BackupAndRestore` callback of another training run,
            or by another callback (e.g. `ModelCheckpoint`)
            of the same training run.
        save_freq: `"epoch"`, integer, or `False`. When set to `"epoch"`
          the callback saves the checkpoint at the end of each epoch.
          When set to an integer, the callback saves the checkpoint every
          `save_freq` batches. Set `save_freq=False` only if using
          preemption checkpointing (i.e. with `save_before_preemption=True`).
        double_checkpoint: Boolean. If enabled, `BackupAndRestore` callback
          will save 2 last training states (current and previous). After
          interruption if current state can't be loaded due to IO error
          (e.g. file corrupted) it will try to restore previous one. Such
          behaviour will consume twice more space on disk, but increase fault
          tolerance. Defaults to `False`.
        delete_checkpoint: Boolean. This `BackupAndRestore`
          callback works by saving a checkpoint to back up the training state.
          If `delete_checkpoint=True`, the checkpoint will be deleted after
          training is finished. Use `False` if you'd like to keep the checkpoint
          for future usage. Defaults to `True`.
    c                    t         |           || _        || _        || _        d| _        d| _        d| _        |st        d      || _	        t        j                  |d      | _        t        j                  |d      | _        | j                  dz   | _        | j                  dz   | _        |dk7  r t!        |t"              st        d| d      y y )	Nr   z"Empty `backup_dir` argument passedzlatest.weights.h5ztraining_metadata.jsonz.bkpepochz<Invalid value for argument `save_freq`. Received: save_freq=z.. Expected either 'epoch' or an integer value.)super__init__	save_freqdouble_checkpointdelete_checkpoint_batches_seen_since_last_saving_last_batch_seen_current_epoch
ValueError
backup_dirr   join_weights_path_training_metadata_path_prev_weights_path_prev_training_metadata_path
isinstanceint)selfr   r   r   r   	__class__s        V/home/dcms/DCMS/lib/python3.12/site-packages/keras/src/callbacks/backup_and_restore.pyr   zBackupAndRestore.__init__P   s     	"!2!2/0, !ABB$'__Z9LM'10(
$ #'"4"4v"=((61 	) 
9c(B''0k 2??  )C    c                    	 | j                          y # t        $ r}t        j                  | j                        s|t        j
                  | j                  | j                         t        j                  | j                        r+t        j
                  | j                  | j                         n>t        j                  | j                        rt        j                  | j                         | j                          Y d }~y d }~ww xY wN)
_load_modelOSErrorr   existsr   copyr   r   r   remove)r   logses      r   on_train_beginzBackupAndRestore.on_train_beginq   s    	 	$$T%<%<=OOD33T5G5GH  !B!BC5500 ""4#?#?@!!$">">?	s    	D	C#DD	c                    | j                   j                  st        d| j                    d      t        j                  | j
                        r| j                   j                  Y| j                   j                  j                  s9| j                   j                  j                  | j                   j                         | j                   j                  | j
                         t        j                  | j                        rct        j                  | j                  d      5 }t        j                  |j                               }ddd       d   }|| j                   _        yy# 1 sw Y   !xY w)z6Get training state from temporary file and restore it.z]To use the BackupAndRestore callback, you model must be built before you call `fit()`. Model zJ is unbuilt. You can build it beforehand by calling it on a batch of data.Nrr	   )modelbuiltr   r   r#   r   	optimizerbuildtrainable_variablesload_weightsr   Filejsonloadsread_initial_epoch)r   ftraining_metadatar	   s       r   r!   zBackupAndRestore._load_model   s   zz %??  T//0

$$0

,,22 

$$**4::+I+IJJJ##D$6$67T99:!=!=sC 9q$(JJqvvx$8!9%g.E(-DJJ%	 ;9 9s   "$E&&E/c                 f    |dz   | _         d| _        | j                  dk(  r| j                          y y )N   r   r	   )r   r   r   _save_model)r   r	   r&   s      r   on_epoch_endzBackupAndRestore.on_epoch_end   s3    #ai !>>W$ %r   c                 H    | j                  |      r| j                          y y r    )_should_save_on_batchr:   )r   batchr&   s      r   on_train_batch_endz#BackupAndRestore.on_train_batch_end   s!    %%e, -r   c                    t        j                  | j                        st        j                  | j                         | j                  rIt        j                  | j
                        r*t        j                  | j
                  | j                         | j                  rIt        j                  | j                        r*t        j                  | j                  | j                         | j                  j                  | j
                  d       t        j                  | j                  d      5 }| j                  | j                  d}|j                  t!        j"                  |             ddd       y# 1 sw Y   yxY w)a  Saves the model.

        Args:
            epoch: the epoch this iteration is in.
            batch: the batch this iteration is in. `None` if the `save_freq`
                is set to `"epoch"`.
            logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.
        T)filepath	overwritew)r	   r>   N)r   r#   r   makedirsr   r   r$   r   r   r   r+   save_weightsr1   r   r   writer2   dumps)r   r6   r7   s      r   r:   zBackupAndRestore._save_model   s      10!!j&7&78J8J&KOOD..0G0GH!!j&7&7(('
 OO,,d.O.O 	

););tL__T993? 	31,,..! GGDJJ012	3 	3 	3s   0>E77F c                     | j                   dk(  ry|| j                  k  r|dz   }n|| j                  z
  }| xj                  |z  c_        || _        | j                  | j                   k\  rd| _        yy)z?Handles batch-level saving logic, supports steps_per_execution.r	   Fr9   r   T)r   r   r   )r   r>   add_batchess      r   r=   z&BackupAndRestore._should_save_on_batch   sq    >>W$D)))!)K$"7"77K,,;, %//4>>A34D0r   c                     | j                   r@t        j                  | j                        r t        j                  | j                         y y y r    )r   r   r#   r   rmtree)r   r&   s     r   on_train_endzBackupAndRestore.on_train_end   s7    !!j&7&7&Hdoo. 'I!r   )r	   FTr    )__name__
__module____qualname____doc__r   r(   r!   r;   r?   r:   r=   rL   __classcell__)r   s   @r   r   r      s>    DR B".038 /r   r   )r2   keras.src.api_exportr   keras.src.callbacks.callbackr   keras.src.utilsr   r    r   r   <module>rV      s5     - 1 & 01I/x I/ 2I/r   