
from keras.src import backend
from keras.src import ops
from keras.src.api_export import keras_export
from keras.src.optimizers import optimizer


@keras_export(["keras.optimizers.Adafactor"])
class Adafactor(optimizer.Optimizer):
    """Optimizer that implements the Adafactor algorithm.

    Adafactor is commonly used in NLP tasks, and has the advantage
    of taking less memory because it only saves partial information of previous
    gradients.

    The default argument setup is based on the original paper (see reference).
    When gradients have rank >= 2, Adafactor factors its second-moment
    estimate over the last 2 dimensions: it keeps one accumulator that drops
    the last dimension and one that drops the second-to-last dimension.
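
    For example, a kernel variable of shape `(64, 128)` gets a row
    accumulator of shape `(64,)` and a column accumulator of shape `(128,)`,
    while variables of rank < 2 fall back to an unfactored accumulator.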

    Args:
        learning_rate: A float, a
            `keras.optimizers.schedules.LearningRateSchedule` instance, or
            a callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
        beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
        epsilon_1: float, defaults to 1e-30. A small offset to keep denominator
            away from 0.
        epsilon_2: float, defaults to 1e-3. A small offset to avoid learning
            rate becoming too small by time.
        clip_threshold: float, defaults to 1.0. Clipping threshold. This is a
            part of Adafactor algorithm, independent from `clipnorm`,
            `clipvalue`, and `global_clipnorm`.
        relative_step: bool, defaults to `True`. If `learning_rate` is a
            constant and `relative_step=True`, the learning rate is adjusted
            based on the current iteration, as described in the note below
            the argument list. This is the default learning rate decay used
            in Adafactor.
        {{base_optimizer_keyword_args}}
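
    Note: with `relative_step=True` and a constant `learning_rate`, the
    learning rate actually used at iteration `t` (0-based) is
    `min(learning_rate, 1 / sqrt(t + 1))`, so the step size shrinks as
    training progresses.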

    Reference:

    - [Shazeer, Noam et al., 2018](https://arxiv.org/abs/1804.04235).
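
    Example (a minimal usage sketch; the tiny model and random data below
    are placeholders, not part of the optimizer itself):

    ```python
    import numpy as np
    import keras

    # A toy regression model; any Keras model works the same way.
    model = keras.Sequential([keras.layers.Dense(1)])
    model.compile(
        optimizer=keras.optimizers.Adafactor(learning_rate=1e-3),
        loss="mse",
    )
    x = np.random.rand(64, 8).astype("float32")
    y = np.random.rand(64, 1).astype("float32")
    model.fit(x, y, batch_size=16, epochs=1, verbose=0)
    ```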

    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_2_decay=-0.8,
        epsilon_1=1e-30,
        epsilon_2=1e-3,
        clip_threshold=1.0,
        relative_step=True,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        loss_scale_factor=None,
        gradient_accumulation_steps=None,
        name="adafactor",
        **kwargs,
    ):
        super().__init__(
            learning_rate=learning_rate,
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            loss_scale_factor=loss_scale_factor,
            gradient_accumulation_steps=gradient_accumulation_steps,
            **kwargs,
        )
        self.beta_2_decay = beta_2_decay
        self.epsilon_1 = epsilon_1
        self.epsilon_2 = epsilon_2
        self.clip_threshold = clip_threshold
        self.relative_step = relative_step
    def build(self, var_list):
        """Initialize optimizer variables.

        Adafactor keeps up to 3 variables per model variable: a row
        accumulator `_r` and a column accumulator `_c` for the factored
        second moment, and a velocity `_v` used when the variable is not
        factored.

        Args:
            var_list: list of model variables to build Adafactor variables
                on.
        """
        if self.built:
            return
        super().build(var_list)
        self._r = []
        self._c = []
        self._v = []
        for var in var_list:
            if len(var.shape) < 2:
                # Don't factor if the variable has rank < 2, but still create
                # dummy variables as placeholders so the lists stay aligned.
                self._r.append(
                    backend.Variable(0, name=var.name, trainable=False)
                )
                self._c.append(
                    backend.Variable(0, name=var.name, trainable=False)
                )
            elif self._overwrite_variable_with_gradient(var):
                self._r.append(None)
                self._c.append(None)
            else:
                # Always factor the last 2 dimensions.
                r_shape = var.shape[:-1]
                c_shape = var.shape[:-2] + (var.shape[-1],)
                self._r.append(
                    self.add_variable(
                        shape=r_shape,
                        dtype=var.dtype,
                        name=var.name,
                    )
                )
                self._c.append(
                    self.add_variable(
                        shape=c_shape,
                        dtype=var.dtype,
                        name=var.name,
                    )
                )
            if self._overwrite_variable_with_gradient(var):
                self._v.append(None)
            else:
                self._v.append(
                    self.add_variable_from_reference(
                        reference_variable=var, name="velocity"
                    )
                )
    def _rms(self, x):
        return ops.sqrt(ops.mean(ops.square(x)))

    def update_step(self, gradient, variable, learning_rate):
        """Update step given gradient and the associated model variable."""
        lr = ops.cast(learning_rate, variable.dtype)
        gradient = ops.cast(gradient, variable.dtype)
        epsilon_2 = ops.cast(self.epsilon_2, variable.dtype)
        one = ops.cast(1.0, variable.dtype)
        local_step = ops.cast(self.iterations + 1, variable.dtype)
        if not callable(self._learning_rate) and self.relative_step:
            # Decay a constant learning rate with the iteration count.
            lr = ops.minimum(lr, 1 / ops.sqrt(local_step))

        r = self._r[self._get_variable_index(variable)]
        c = self._c[self._get_variable_index(variable)]
        v = self._v[self._get_variable_index(variable)]

        rho_t = ops.minimum(lr, 1 / ops.sqrt(local_step))
        alpha_t = ops.maximum(epsilon_2, self._rms(variable)) * rho_t
        regulated_grad_square = ops.add(ops.square(gradient), self.epsilon_1)
        beta_2_t = 1 - ops.power(local_step, self.beta_2_decay)

        if len(variable.shape) >= 2:
            # `r` drops the last dimension of the gradient, so it is of shape
            # `gradient.shape[:-1]`.
            self.assign(
                r,
                beta_2_t * r
                + (1 - beta_2_t) * ops.mean(regulated_grad_square, axis=-1),
            )
            # `c` drops the second-to-last dimension of the gradient, so it
            # is of shape `gradient.shape[:-2] + (gradient.shape[-1],)`.
            self.assign(
                c,
                beta_2_t * c
                + (1 - beta_2_t) * ops.mean(regulated_grad_square, axis=-2),
            )
            # Recompose the factored statistics into the full-size second
            # moment estimate `v`.
            self.assign(
                v,
                ops.expand_dims(
                    r / ops.mean(r, axis=-1, keepdims=True), axis=-1
                )
                * ops.expand_dims(c, -2),
            )
        else:
            self.assign(
                v, beta_2_t * v + (1 - beta_2_t) * regulated_grad_square
            )

        u_t = ops.divide(gradient, ops.sqrt(v))
        u_t_hat = ops.divide(
            u_t,
            ops.maximum(one, ops.divide(self._rms(u_t), self.clip_threshold)),
        )
        self.assign_sub(variable, ops.multiply(alpha_t, u_t_hat))

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "beta_2_decay": self.beta_2_decay,
                "epsilon_1": self.epsilon_1,
                "epsilon_2": self.epsilon_2,
                "clip_threshold": self.clip_threshold,
                "relative_step": self.relative_step,
            }
        )
        return config


Adafactor.__doc__ = Adafactor.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)