
    2Vh9*                     r    d dl Z d dlmZ d dlmZ d dlmZ  edg       G d dej                               Zy)    N)ops)keras_export)	optimizerzkeras.optimizers.Muonc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Z fdZd Zd Zd Z	d Z
d	efd
Z fdZ xZS )Muona>  Optimizer that implements the Muon algorithm.

    Note that this optimizer should not be used in the following layers:

    1. Embedding layer
    2. Final output fully connected layer
    3. Any {0,1}-D variables

    These should all be optimized using AdamW.

    The Muon optimizer can use both the Muon update step or the
    AdamW update step based on the following:

    - For any variable that isn't 2D, 3D or 4D, the AdamW step
        will be used. This is not configurable.
    - If the argument `exclude_embeddings` (defaults to `True`) is set
    to `True`, the AdamW step will be used.
    - For any variablewith a name that matches an expression
        listed in the argument `exclude_layers` (a list), the
        AdamW step will be used.
    - Any other variable uses the Muon step.

    Typically, you only need to pass the name of your densely-connected
    output layer to `exclude_layers`, e.g.
    `exclude_layers=["output_dense"]`.

    References:
        - [Original implementation](https://github.com/KellerJordan/Muon)
        - [Liu et al, 2025](https://arxiv.org/abs/2502.16982)

    Args:
        learning_rate: A float,
            `keras.optimizers.schedules.LearningRateSchedule` instance, or
            a callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
        adam_beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
        adam_beta_2: A float value or a constant float tensor, ora callable
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
        epsilon: A small constant for numerical stability. This is
            "epsilon hat" in the Kingma and Ba paper
            (in the formula just before Section 2.1),
            not the epsilon in Algorithm 1 of the paper.
            It be used at Adamw.Defaults to `1e-7`.
        exclude_layers: List of strings, keywords of layer names to exclude.
            All layers with keywords in their path will use adamw.
        exclude_embeddings: Boolean value
            If True, embedding layers will use adamw.
        muon_a: Float, parameter a of the muon algorithm.
            It is recommended to use the default value
        muon_b: Float, parameter b of the muon algorithm.
            It is recommended to use the default value
        muon_c: Float, parameter c of the muon algorithm.
            It is recommended to use the default value
        adam_lr_ratio: Float, the ratio of the learning rate when
                using Adam to the main learning rate.
                it is recommended to set it to 0.1
        momentum: Float, momentum used by internal SGD.
        ns_steps: Integer, number of Newton-Schulz iterations to run.
        nesterov: Boolean, whether to use Nesterov-style momentum
        {{base_optimizer_keyword_args}}
    c                     t        |   d|||||||	|
|||d| || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |xs g | _        y )N)learning_ratenameweight_decayclipnorm	clipvalueglobal_clipnormuse_emaema_momentumema_overwrite_frequencyloss_scale_factorgradient_accumulation_steps )super__init__adam_beta_1adam_beta_2epsilonmuon_amuon_bmuon_cadam_lr_ratiomomentumns_stepsnesterovexclude_embeddingsexclude_layers)selfr	   r   r   r   r   r   r   r   r   r   r   r   r   r
   r"   r!   r   r   r   r   r   r   r    kwargs	__class__s                            I/home/dcms/DCMS/lib/python3.12/site-packages/keras/src/optimizers/muon.pyr   zMuon.__init__M   s    6 	 	
'%+%$;/(C	
 	
 '&*   "4,2    c                     dt        |j                        cxk  rdk  sy y| j                  rd|j                  j	                         v ry| j
                  D ]$  }t        j                  ||j                        s$ y y)N      T	embeddingF)lenshaper!   pathlowerr"   research)r#   variablekeywords      r&   _should_use_adamwzMuon._should_use_adamw   st     3x~~&** +""{hmm6I6I6K'K** 	Gyy(--0	 r'   c                 t   | j                   ryt        | 	  |       i | _        i | _        i | _        i | _        |D ]z  }| j                  |      r| j                  |d      | j                  |j                  <   | j                  |      sQ| j                  |d      | j                  |j                  <   | y)a  Initialize optimizer variables.

        Adam optimizer has 3 types of variables: momentums, velocities and
        velocity_hat (only set when amsgrad is applied),

        Args:
            var_list: list of model variables to build Adam variables on.
        Nr   )reference_variabler
   velocity)builtr   buildadam_momentumsadam_velocitiesmuon_momentumsmuon_velocities!_overwrite_variable_with_gradientadd_variable_from_referencer.   r4   )r#   var_listvarr%   s      r&   r9   z
Muon.build   s     ::h ! ! 	C99#>44+.Z 5  ##CHH-
 ))#.88/2 9  ((2	r'   c                     | j                  |      r!| j                  |||| j                  z         y | j                  |||       y )N)r4   _adamw_update_stepr   _muon_update_step)r#   gradientr2   r	   s       r&   update_stepzMuon.update_step   sD    !!(+##(MD4F4F$F ""8X}Er'   c           
         | j                   |j                     }| j                  |t        j                  ||| j
                  dz
  z               |j                  }| j                  r$t        j                  || j
                  |z        }n|}| j                  ||| j                  || j                        z  t        d|d   |d   z        dz  z         y )Nr)   r   g      ?)r:   r.   
assign_addr   addr   r-   r    
assign_subzeropower_via_newtonschulz5r   max)r#   rE   r2   lrmr-   gs          r&   rD   zMuon._muon_update_step   s    .3778Q$--!2C-DEF==$--!"34AA..q$--@A!U1Xa()S01	
r'   c                    t        j                  ||j                        }t        j                  ||j                        }t        j                  | j                  dz   |j                        }t        j                  t        j                  | j
                  |j                        |      }t        j                  t        j                  | j                  |j                        |      }| j                  |j                     }| j                  |j                     }	|t        j                  d|z
        z  d|z
  z  }
| j                  |t        j                  t        j                  ||      d| j
                  z
               | j                  |	t        j                  t        j                  t        j                  |      |	      d| j                  z
               | j                  |t        j                   t        j                  ||
      t        j"                  t        j                  |	      | j$                                     y)z=Update step given gradient and the associated model variable.r)   N)r   castdtype
iterationspowerr   r   r:   r.   r;   sqrtrH   multiplysubtractsquarerJ   dividerI   r   )r#   rE   r2   r	   rM   
local_stepadam_beta_1_poweradam_beta_2_powerrN   valphas              r&   rC   zMuon._adamw_update_step   s   XXmX^^488Hhnn5XXdoo18>>B
IIHHT%%x~~6

  IIHHT%%x~~6

 .  /SXXa"3344<M8MNs||CLL15q4;K;K7KL	
 	LLSZZ115q4;K;K7K	
 	JJQ&T\\(J	
r'   c                     t        j                  |      }t        t        t	        |                  }|d   |d<   t	        |      dz
  |d<   t        j
                  ||      }|S )N   )r   r-   listranger,   	transpose)r#   Xr-   
temp_orders       r&   transpose_last_axiszMuon.transpose_last_axis   sU    		!%E
+,
#B
2Ua
2MM!Z(r'   stepsc                    t        j                  |      }t        |      dk\  sJ | j                  | j                  | j
                  }}}|d   |d   kD  r| j                  |      }|t        j                  |dd      dz   z  }t        |      D ]/  }|| j                  |      z  }||z  ||z  |z  z   }	||z  |	|z  z   }1 |d   |d   kD  r| j                  |      }|S )a  We apply the Newton-Schulz iteration to compute matrix G.

        We select a quintic iteration that maximizes the slope at zero. This
        approach helps minimize steps, even if the iteration doesn't fully
        converge across the interval. The result isn't exactly UV^T (from the
        SVD of G), but rather an approximation like US'V^T. Despite this
        approximation, model performance remains unaffected compared to using
        the exact UV^T from the SVD.
        rb   ra   r`   )ra   r`   T)axiskeepdimsHz>)	r   r-   r,   r   r   r   rh   normrd   )
r#   xri   r-   abc_temp_atemp_bs
             r&   rK   z Muon.zeropower_via_newtonschulz5   s     		!5zQ++t{{DKKa19uRy ((+A !(T:TABu 	#A11!44FZ!f*v"55FA
"A	#
 9uRy ((+Ar'   c                 P   t         |          }|j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  d       |S )N)r   r   r   r"   r   r   r   r   r   r   r    r!   )r   
get_configupdater   r   r   r"   r   r   r   r   r   r   r    r!   )r#   configr%   s     r&   rw   zMuon.get_config  s    #%#//#//<<"&"5"5++++++!%!3!3 MM MM MM&*&=&=	
  r'   )gMbP?g?g+?rm   皙?NNNFgGz?NNNmuonNTguV@ggn@ @rz   gffffff?   T)__name__
__module____qualname____doc__r   r4   r9   rF   rD   rC   rh   intrK   rw   __classcell__)r%   s   @r&   r   r      s    AJ  $$(143l@F
 
BC : r'   r   )	r0   	keras.srcr   keras.src.api_exportr   keras.src.optimizersr   	Optimizerr   r   r'   r&   <module>r      s>    	  - * &'(X9 X )Xr'   