
from keras.src.api_export import keras_export
from keras.src.optimizers import adam
from keras.src.optimizers import optimizer


@keras_export("keras.optimizers.AdamW")
class AdamW(adam.Adam):
    """Optimizer that implements the AdamW algorithm.

    AdamW optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments with an added
    method to decay weights per the techniques discussed in the paper,
    'Decoupled Weight Decay Regularization' by
    [Loshchilov, Hutter et al., 2019](https://arxiv.org/abs/1711.05101).
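
    Schematically, and ignoring bias correction, gradient clipping, and any
    learning-rate schedule, the decoupled update applies the weight decay
    directly to the parameters rather than adding an L2 term to the
    gradients:

        w <- w - learning_rate * (adam_step + weight_decay * w)

    where `adam_step` is the usual Adam direction,
    `m_hat / (sqrt(v_hat) + epsilon)`.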

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the underlying Adam method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
        learning_rate: A float, a
            `keras.optimizers.schedules.LearningRateSchedule` instance, or
            a callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
        beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates.
            Defaults to `0.9`.
        beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates.
            Defaults to `0.999`.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to 1e-7.
        amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond".
            Defaults to `False`.
        {{base_optimizer_keyword_args}}
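
    Example:

    A minimal sketch of constructing the optimizer and passing it to
    `Model.compile()`; the model and loss here are placeholders:

    ```python
    import keras

    model = keras.Sequential([keras.layers.Dense(1)])
    optimizer = keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=0.004)
    model.compile(optimizer=optimizer, loss="mse")
    ```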

    References:

    - [Loshchilov et al., 2019](https://arxiv.org/abs/1711.05101)
    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) for `adam`
    - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
    """

    def __init__(
        self,
        learning_rate=0.001,
        weight_decay=0.004,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        loss_scale_factor=None,
        gradient_accumulation_steps=None,
        name="adamw",
        **kwargs,
    ):
        super().__init__(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            amsgrad=amsgrad,
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            loss_scale_factor=loss_scale_factor,
            gradient_accumulation_steps=gradient_accumulation_steps,
            **kwargs,
        )

        if self.weight_decay is None:
            raise ValueError(
                "Argument `weight_decay` must be a float. "
                "Received: weight_decay=None"
            )


AdamW.__doc__ = AdamW.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
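

if __name__ == "__main__":
    # Illustrative sketch, not part of the original module: with a zero
    # gradient the Adam direction is zero, so only the decoupled weight
    # decay term should move the variable,
    # w <- w - learning_rate * weight_decay * w.
    # Assumes the public Keras 3 API (`keras.Variable`, `keras.ops`,
    # `Optimizer.apply_gradients`) is available.
    import keras

    w = keras.Variable(keras.ops.ones((2,)), name="w")
    zero_grad = keras.ops.zeros((2,))
    opt = keras.optimizers.AdamW(learning_rate=0.1, weight_decay=0.004)
    opt.apply_gradients([(zero_grad, w)])
    # Expected result, roughly: [0.9996, 0.9996]
    print(keras.ops.convert_to_numpy(w))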