
"""Contains LossScale classes."""
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import smart_cond
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training.experimental import loss_scale as loss_scale_module
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export


@deprecation.deprecated_endpoints(
    'train.experimental.MixedPrecisionLossScaleOptimizer')
@tf_export(
    v1=['train.experimental.MixedPrecisionLossScaleOptimizer',
        'mixed_precision.MixedPrecisionLossScaleOptimizer'])
class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer):
  """An optimizer that applies loss scaling.

  Loss scaling is a process that multiplies the loss by a multiplier called the
  loss scale, and divides each gradient by the same multiplier. The pseudocode
  for this process is:

  ```
  loss = ...
  loss *= loss_scale
  grads = gradients(loss, vars)
  grads /= loss_scale
  ```

  Mathematically, loss scaling has no effect, but can help avoid numerical
  underflow in intermediate gradients when float16 tensors are used for mixed
  precision training. By multiplying the loss, each intermediate gradient will
  have the same multiplier applied.
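  For example, a gradient component near 1e-8 flushes to zero in float16,
  whose smallest positive value is roughly 6e-8; with a loss scale of 1024 the
  same component is computed as roughly 1e-5, which float16 can represent, and
  dividing by 1024 afterwards recovers the intended value.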

  The loss scale can either be a fixed constant, chosen by the user, or be
  dynamically determined. Dynamically determining the loss scale is convenient
  as a loss scale does not have to be explicitly chosen. However it reduces
  performance.

  This optimizer wraps another optimizer and applies loss scaling to it via a
  `LossScale`. Loss scaling is applied whenever gradients are
  computed, such as through `minimize()`.
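
  For example (an illustrative sketch added to this docstring rather than
  taken from it; the base optimizer and the `loss` tensor are arbitrary
  placeholders):

  ```
  opt = tf.compat.v1.train.GradientDescentOptimizer(0.1)
  opt = tf.compat.v1.mixed_precision.MixedPrecisionLossScaleOptimizer(
      opt, loss_scale='dynamic')
  # minimize() scales the loss, computes gradients, unscales them, and (with a
  # dynamic loss scale) only applies them when they are all finite.
  train_op = opt.minimize(loss)
  ```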
  """

  def __init__(self, opt, loss_scale):
    if not isinstance(opt, optimizer.Optimizer):
      raise ValueError('"opt" must be an instance of Optimizer, but got: %s' %
                       type(opt))
    self._optimizer = opt

    use_locking = opt._use_locking  # pylint: disable=protected-access
    name = opt.get_name()
    super(MixedPrecisionLossScaleOptimizer, self).__init__(use_locking, name)

    self._loss_scale = loss_scale_module.get(loss_scale)
    if self._loss_scale is None:
      raise ValueError('loss_scale cannot be None')
    self._track_trackable(self._optimizer, 'base_optimizer')
    self._track_trackable(self._loss_scale, 'loss_scale')

  def _doing_dynamic_loss_scaling(self):
    """Check if `_loss_scale` dynamically manages the loss scale."""
    return isinstance(self._loss_scale, loss_scale_module.DynamicLossScale)

  def compute_gradients(self,
                        loss,
                        var_list=None,
                        gate_gradients=optimizer.Optimizer.GATE_OP,
                        aggregation_method=None,
                        colocate_gradients_with_ops=False,
                        grad_loss=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This adjusts the dynamic range of the gradient evaluation by scaling up
    the `loss` value. The gradient values are then scaled back down by the
    reciprocal of the loss scale. This is useful in reduced precision training
    where small gradient values would otherwise underflow the representable
    range.

    Args:
      loss: A Tensor containing the value to minimize or a callable taking no
        arguments which returns the value to minimize. When eager execution is
        enabled it must be a callable.
      var_list: Optional list or tuple of `tf.Variable` to update to minimize
        `loss`.  Defaults to the list of variables collected in the graph under
        the key `GraphKeys.TRAINABLE_VARIABLES`.
      gate_gradients: How to gate the computation of gradients.  Can be
        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
      aggregation_method: Specifies the method used to combine gradient terms.
        Valid values are defined in the class `AggregationMethod`.
      colocate_gradients_with_ops: If True, try colocating gradients with the
        corresponding op.
      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.
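
    For example (an illustrative sketch added to this docstring; `opt` is
    assumed to be a `MixedPrecisionLossScaleOptimizer` wrapping some base
    optimizer):

    ```
    v = tf.compat.v1.Variable(1.0)
    loss = 3.0 * v
    # The loss is multiplied by the loss scale before differentiation and the
    # resulting gradients are divided by the loss scale, so the returned
    # gradient for `v` evaluates to the ordinary value 3.0.
    grads_and_vars = opt.compute_gradients(loss, var_list=[v])
    ```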
    """
    loss = self._scale_loss(loss)
    grads_and_vars = self._optimizer.compute_gradients(
        loss=loss,
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)

    grads = [g for g, _ in grads_and_vars]
    variables = [v for _, v in grads_and_vars]
    unscaled_grads = self._unscale_grads(grads)
    return list(zip(unscaled_grads, variables))

  def _scale_loss(self, loss):
    loss_scale = self._loss_scale()
    if callable(loss):

      def new_loss():
        loss_val = loss()
        return loss_val * math_ops.cast(loss_scale, loss_val.dtype)

      return new_loss
    else:
      return loss * math_ops.cast(loss_scale, loss.dtype)

  def _unscale_grads(self, grads):
    loss_scale = self._loss_scale()
    loss_scale_reciprocal = 1 / loss_scale
    return [
        None if g is None else self._scale_grad(g, loss_scale_reciprocal)
        for g in grads
    ]

  def _scale_grad(self, grad, loss_scale_reciprocal):
    if isinstance(grad, indexed_slices.IndexedSlices):
      grad_vals = grad.values * loss_scale_reciprocal
      return indexed_slices.IndexedSlices(grad_vals, grad.indices,
                                          grad.dense_shape)
    return grad * loss_scale_reciprocal

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This is the second part of `minimize()`. It returns an `Operation` that
    conditionally applies gradients if all gradient values are finite.
    Otherwise no update is performed (nor is `global_step` incremented).

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        `compute_gradients()`.
      global_step: Optional `Variable` to increment by one after the variables
        have been updated.
      name: Optional name for the returned operation.  Default to the name
        passed to the `Optimizer` constructor.

    Returns:
      An `Operation` that conditionally applies the specified gradients. If
      `global_step` was not None, that operation also increments `global_step`.

    Raises:
      RuntimeError: If you should use `_distributed_apply()` instead.
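
    For example (an illustrative sketch added to this docstring; `opt`, `loss`
    and `global_step` are assumed to already exist):

    ```
    grads_and_vars = opt.compute_gradients(loss)
    train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
    # With a dynamic loss scale, running train_op applies the gradients (and
    # increments global_step) only if every gradient is finite; otherwise it
    # leaves the variables unchanged and lowers the loss scale.
    ```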
    """
    if distribute_lib.in_cross_replica_context():
      raise ValueError('apply_gradients() must be called in a replica '
                       'context.')

    if not self._doing_dynamic_loss_scaling():
      return self._optimizer.apply_gradients(grads_and_vars, global_step, name)

    replica_context = distribute_lib.get_replica_context()
    grads_and_vars = tuple(grads_and_vars)
    return replica_context.merge_call(
        self._distributed_apply, args=(grads_and_vars, global_step, name))

  def _distributed_apply(self,
                         distribution,
                         grads_and_vars,
                         global_step=None,
                         name=None):
    """A version of `apply_gradients` for cross replica context.

    When users are in a cross replica strategy, they must call this rather than
    `apply_gradients()`.

    Args:
      distribution: a `DistributionStrategy` object.
      grads_and_vars: List of (gradient, variable) pairs as returned by
        `compute_gradients()` and then aggregated across replicas.
      global_step: Optional (mirrored) `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation. Default to the name passed
        to the `Optimizer` constructor.

    Returns:
      An `Operation` that applies the specified gradients across all
      replicas. If `global_step` was not None, that operation also
      increments `global_step`
    """
    name = name if name is not None else self.get_name()
    grads = [g for g, _ in grads_and_vars]
    loss_scale_update_op, should_apply_grads = self._loss_scale.update(grads)

    def apply_fn():
      return self._apply_gradients(distribution, grads_and_vars, global_step,
                                   name + '-wrapped')

    maybe_apply_op = smart_cond.smart_cond(should_apply_grads, apply_fn,
                                           control_flow_ops.no_op)
    return control_flow_ops.group(maybe_apply_op, loss_scale_update_op,
                                  name=name)

  def _apply_gradients(self, distribution, grads_and_vars, global_step, name):
    """Unconditionally apply gradients in cross replica context."""
    update_ops = distribution.extended.call_for_each_replica(
        self._optimizer.apply_gradients,
        args=(grads_and_vars, global_step, name))
    return distribution.group(update_ops)

  def _apply_sparse(self, grad, var):
    """This function should never be called."""
    raise RuntimeError('This function should never be called')

  def _apply_dense(self, grad, var):
    """This function should never be called."""
    raise RuntimeError('This function should never be called')

  def _resource_apply_sparse(self, grad, handle, indices):
    """This function should never be called."""
    raise RuntimeError('This function should never be called')

  def _resource_apply_dense(self, grad, handle):
    """This function should never be called."""
    raise RuntimeError('This function should never be called')

  def variables(self):
    """Returns the variables of the Optimizer."""
    return self._optimizer.variables() + list(
        self._loss_scale._weights.values())  # pylint: disable=protected-access