
"""SGD optimizer implementation."""

from tensorflow.python.framework import tensor
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_resource_variable_ops
from tensorflow.python.ops import gen_training_ops


class SGD(optimizer_v2.OptimizerV2):
  r"""Gradient descent (with momentum) optimizer.

  Update rule for parameter `w` with gradient `g` when `momentum` is 0:

  ```python
  w = w - learning_rate * g
  ```

  Update rule when `momentum` is larger than 0:

  ```python
  velocity = momentum * velocity - learning_rate * g
  w = w + velocity
  ```

  When `nesterov=True`, this rule becomes:

  ```python
  velocity = momentum * velocity - learning_rate * g
  w = w + momentum * velocity - learning_rate * g
  ```

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.01.
    momentum: float hyperparameter >= 0 that accelerates gradient descent in
      the relevant direction and dampens oscillations. Defaults to 0, i.e.,
      vanilla gradient descent.
    nesterov: boolean. Whether to apply Nesterov momentum.
      Defaults to `False`.
    name: Optional name prefix for the operations created when applying
      gradients.  Defaults to `"SGD"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.
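
  For example, the clipping keyword arguments can be passed directly at
  construction time (a minimal sketch; the values are illustrative):

  ```python
  # Clip each gradient to a maximum L2 norm of 1.0 before it is applied.
  opt = tf.keras.optimizers.SGD(learning_rate=0.01, clipnorm=1.0)
  # Or clip every gradient element to the range [-0.5, 0.5].
  opt = tf.keras.optimizers.SGD(learning_rate=0.01, clipvalue=0.5)
  ```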

  Usage:

  >>> opt = tf.keras.optimizers.SGD(learning_rate=0.1)
  >>> var = tf.Variable(1.0)
  >>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var) = var
  >>> step_count = opt.minimize(loss, [var]).numpy()
  >>> # Step is `- learning_rate * grad`
  >>> var.numpy()
  0.9

  >>> opt = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
  >>> var = tf.Variable(1.0)
  >>> val0 = var.value()
  >>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var) = var
  >>> # First step is `- learning_rate * grad`
  >>> step_count = opt.minimize(loss, [var]).numpy()
  >>> val1 = var.value()
  >>> (val0 - val1).numpy()
  0.1
  >>> # On later steps, step-size increases because of momentum
  >>> step_count = opt.minimize(loss, [var]).numpy()
  >>> val2 = var.value()
  >>> (val1 - val2).numpy()
  0.18
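
  The `0.18` above follows from expanding the momentum update rule by hand
  (an illustrative calculation, not part of the doctest):

  ```python
  velocity_1 = 0.9 * 0.0 - 0.1 * 1.0         # -0.1, so var: 1.0 -> 0.9
  velocity_2 = 0.9 * velocity_1 - 0.1 * 0.9  # -0.18, so var: 0.9 -> 0.72
  ```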

  Reference:
      - For `nesterov=True`, see [Sutskever et al., 2013](
        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.01,
               momentum=0.0,
               nesterov=False,
               name="SGD",
               **kwargs):
    super(SGD, self).__init__(name, **kwargs)
    # `lr` is accepted as a backwards-compatible alias for `learning_rate`.
    self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
    self._set_hyper("decay", self._initial_decay)

    self._momentum = False
    if (isinstance(momentum, tensor.Tensor) or callable(momentum) or
        momentum > 0):
      self._momentum = True
    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
      raise ValueError("`momentum` must be between [0, 1].")
    self._set_hyper("momentum", momentum)

    self.nesterov = nesterov

  def _create_slots(self, var_list):
    if self._momentum:
      for var in var_list:
        self.add_slot(var, "momentum")

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(SGD, self)._prepare_local(var_device, var_dtype, apply_state)
    apply_state[(var_device, var_dtype)]["momentum"] = array_ops.identity(
        self._get_hyper("momentum", var_dtype))

  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    if self._momentum:
      momentum_var = self.get_slot(var, "momentum")
      return gen_training_ops.ResourceApplyKerasMomentum(
          var=var.handle,
          accum=momentum_var.handle,
          lr=coefficients["lr_t"],
          grad=grad,
          momentum=coefficients["momentum"],
          use_locking=self._use_locking,
          use_nesterov=self.nesterov)
    else:
      return gen_training_ops.ResourceApplyGradientDescent(
          var=var.handle,
          alpha=coefficients["lr_t"],
          delta=grad,
          use_locking=self._use_locking)

  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices,
                                               **kwargs):
    if self._momentum:
      return super(SGD, self)._resource_apply_sparse_duplicate_indices(
          grad, var, indices, **kwargs)
    else:
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = (kwargs.get("apply_state", {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))

      # Without momentum, a sparse update is just a scaled scatter-add.
      return gen_resource_variable_ops.ResourceScatterAdd(
          resource=var.handle,
          indices=indices,
          updates=-grad * coefficients["lr_t"])

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    # This method is only needed for momentum optimization.
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    momentum_var = self.get_slot(var, "momentum")
    return gen_training_ops.ResourceSparseApplyKerasMomentum(
        var=var.handle,
        accum=momentum_var.handle,
        lr=coefficients["lr_t"],
        grad=grad,
        indices=indices,
        momentum=coefficients["momentum"],
        use_locking=self._use_locking,
        use_nesterov=self.nesterov)

  def get_config(self):
    config = super(SGD, self).get_config()
    config.update({
        "learning_rate": self._serialize_hyperparameter("learning_rate"),
        "decay": self._serialize_hyperparameter("decay"),
        "momentum": self._serialize_hyperparameter("momentum"),
        "nesterov": self.nesterov,
    })
    return config