"""Contains LossScale classes."""

import abc

from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import reduce_util
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_v1
from tensorflow.python.ops import variables
from tensorflow.python.trackable import base as trackable
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export


@deprecation.deprecated_endpoints('mixed_precision.experimental.LossScale',
                                  'train.experimental.LossScale')
@tf_export(
    v1=[
        'mixed_precision.LossScale',
        'mixed_precision.experimental.LossScale',
        'train.experimental.LossScale'
    ])
class LossScale(trackable.Trackable, metaclass=abc.ABCMeta):
  """Base class for all TF1 loss scales.

  This is an abstract base class, so you cannot instantiate it directly.
  Instead, use one of its concrete subclasses:
    * `tf.compat.v1.mixed_precision.DynamicLossScale`
    * `tf.compat.v1.mixed_precision.FixedLossScale`

  Loss scaling is a process that multiplies the loss by a multiplier called the
  loss scale, and divides each gradient by the same multiplier. The pseudocode
  for this process is:

  ```
  loss = ...
  loss *= loss_scale
  grads = gradients(loss, vars)
  grads /= loss_scale
  ```

  Mathematically, loss scaling has no effect, but can help avoid numerical
  underflow in intermediate gradients when float16 tensors are used for mixed
  precision training. By multiplying the loss, each intermediate gradient will
  have the same multiplier applied.

  Instances of this class represent a loss scale. Calling instances of this
  class returns the loss scale as a scalar float32 tensor, while method
  `update()` updates the loss scale depending on the values of the gradients.
  Optimizers use instances of this class to scale loss and gradients.

  In most functions that accept a LossScale, you can also pass an int (such as
  8) to create a `FixedLossScale` or the string `"dynamic"` to create a dynamic
  loss scale.
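
  As a rough sketch of how an optimizer might drive a loss scale (reusing the
  placeholder names `loss`, `vars` and `gradients` from the pseudocode above):

  ```
  loss_scale = DynamicLossScale()
  scaled_loss = loss * loss_scale()
  scaled_grads = gradients(scaled_loss, vars)
  grads = [g / loss_scale() for g in scaled_grads]
  update_op, should_apply_gradients = loss_scale.update(grads)
  ```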
  """

  def __init__(self):
    """Initializes the loss scale class."""
    self._weights = {}

  @abc.abstractmethod
  def __call__(self):
    """Returns the current loss scale as a scalar `float32` tensor."""
    pass

  @abc.abstractmethod
  def update(self, grads):
    """Updates the value of the loss scale.

    The loss scale will be potentially updated, based on the value of `grads`.
    The tensor returned by calling this class is only updated when this function
    is evaluated.

    In eager mode, this directly updates the loss scale, so that calling
    `__call__` will return the newly updated loss scale. In graph mode,
    this returns an op that, when evaluated, updates the loss scale.

    This function also returns a `should_apply_gradients` bool. If False,
    gradients should not be applied to the variables that step, as nonfinite
    gradients were found, and the loss scale has been updated to reduce the
    chance of finding nonfinite gradients in the next step. Some loss scale
    classes will always return True, as they cannot adjust themselves in
    response to nonfinite gradients.

    When a DistributionStrategy is used, this function may only be called in a
    cross-replica context.
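
    For example, a caller in graph mode might combine the two return values
    along these lines (an illustrative sketch; `opt`, `grads` and `var_list`
    stand in for a real optimizer, gradient list and variable list):

    ```
    update_op, should_apply = loss_scale.update(grads)
    maybe_apply_op = tf.cond(should_apply,
                             lambda: opt.apply_gradients(zip(grads, var_list)),
                             tf.no_op)
    train_op = tf.group(update_op, maybe_apply_op)
    ```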

    Args:
      grads: A nested structure of unscaled gradients, each of which is the
        gradient of the loss with respect to a weight. The gradients should have
        already been divided by the loss scale before being passed to this
        function. 'None' gradients are accepted, and are ignored.

    Returns:
      update_op: In eager mode, None. In graph mode, an op to update the loss
        scale.
      should_apply_gradients: Either a bool or a scalar boolean tensor. If
        False, the caller should skip applying `grads` to the variables this
        step.
    """
    pass

  def _add_weight(self, name, initial_value, dtype=None):
    """Adds a weight to this loss scale.

    Args:
      name: Variable name.
      initial_value: The variable's initial value.
      dtype: The type of the variable.

    Returns:
      A variable.

    Raises:
      RuntimeError: If a weight with `name` has already been added.
    """
    # The loss scale state is a non-trainable resource variable that is never
    # aggregated across replicas; it is updated explicitly via `update()`.
    variable = variable_v1.VariableV1(
        initial_value=initial_value,
        name=name,
        dtype=dtype,
        trainable=False,
        use_resource=True,
        synchronization=variables.VariableSynchronization.AUTO,
        aggregation=variables.VariableAggregation.NONE)
    if context.executing_eagerly():
      graph_key = None
    else:
      graph = ops.get_default_graph()
      graph_key = graph._graph_key  # pylint: disable=protected-access

    key = (name, graph_key)
    if self._weights.get(key, None) is not None:
      raise RuntimeError('Duplicate variables detected. {}'.format(key))
    self._weights[key] = variable
    self._handle_deferred_dependencies(name=name, trackable=variable)
    return variable

  def _trackable_children(self,
                          save_type=trackable.SaveType.CHECKPOINT,
                          **kwargs):
    """From Trackable. Gather graph-specific weights to save."""
    if context.executing_eagerly():
      graph_key = None
    else:
      graph = ops.get_default_graph()
      graph_key = graph._graph_key  # pylint: disable=protected-access
    weights = {}
    # Only expose the weights that belong to the current graph.
    for (name, g), v in sorted(self._weights.items(), key=lambda i: i[0][0]):
      if g == graph_key:
        weights[name] = v
    weights.update(
        super(LossScale, self)._trackable_children(save_type, **kwargs))
    return weights

  def _lookup_dependency(self, name, cached_dependencies=None):
    """From Trackable. Find a weight in the current graph."""
    unconditional = super(LossScale, self)._lookup_dependency(
        name, cached_dependencies)
    if unconditional is not None:
      return unconditional
    if context.executing_eagerly():
      graph_key = None
    else:
      graph = ops.get_default_graph()
      graph_key = graph._graph_key  # pylint: disable=protected-access
    return self._weights.get((name, graph_key), None)

  @abc.abstractmethod
  def get_config(self):
    """Returns the config of this loss scale."""
    pass

  @classmethod
  def from_config(cls, config):
    """Creates the LossScale from its config."""
    return cls(**config)


@deprecation.deprecated_endpoints(
    'mixed_precision.experimental.FixedLossScale',
    'train.experimental.FixedLossScale')
@tf_export(
    v1=[
        'mixed_precision.FixedLossScale',
        'mixed_precision.experimental.FixedLossScale',
        'train.experimental.FixedLossScale'
    ])
class FixedLossScale(LossScale):
  """Loss scale with a fixed value.

  The loss scale is not updated for the lifetime of instances of this class.
  A given instance of this class always returns the same number when called.
  """

  @deprecation.deprecated(
      None, 'Use tf.keras.mixed_precision.LossScaleOptimizer instead. '
            'LossScaleOptimizer now has all the functionality of '
            'FixedLossScale')
  def __init__(self, loss_scale_value):
    """Creates the fixed loss scale.

    Args:
      loss_scale_value: A Python float. Its ideal value varies depending on
        models to run. Choosing a too small loss_scale might affect model
        quality; a too big loss_scale might cause inf or nan. There is no single
        right loss_scale to apply. There is no harm choosing a relatively big
        number as long as no nan or inf is encountered in training.

    Raises:
      ValueError: If loss_scale_value is less than 1.
    """
    super(FixedLossScale, self).__init__()
    if not isinstance(loss_scale_value, (int, float)):
      raise ValueError('loss_scale_value must be a Python int or float.')
    if loss_scale_value < 1:
      raise ValueError('loss_scale_value must be at least 1.')
    # Keep the value as a Python float; a tensor is only created on demand in
    # `__call__`.
    self._loss_scale_value = float(loss_scale_value)

  def __call__(self):
    return ops.convert_to_tensor(self._loss_scale_value)

  def update(self, grads):
    del grads
    return control_flow_ops.no_op(), True

  def __repr__(self):
    return 'FixedLossScale(%s)' % self._loss_scale_value

  def get_config(self):
    return {'loss_scale_value': self._loss_scale_value}


def _is_all_finite(grads):
  """Returns a scalar boolean tensor indicating if all gradients are finite."""

  def raw_values(g):
    return g.values if isinstance(g, indexed_slices.IndexedSlices) else g

  is_finite_per_grad = [
      math_ops.reduce_all(math_ops.is_finite(raw_values(g)))
      for g in grads
      if g is not None
  ]
  return math_ops.reduce_all(is_finite_per_grad)


def _op_in_graph_mode(tensor):
  """Returns the tensor's op in graph mode, or the tensor in eager mode.

  This is useful because sometimes an op is needed in graph mode instead of a
  tensor. In eager mode, there are no ops.

  Args:
    tensor: A tensor.

  Returns:
    The tensor's op in graph mode. The tensor in eager mode.
  """
  if context.executing_eagerly():
    return tensor
  return tensor.op


def _assign_if_finite(var, value):
  """Assigns a value to a variable if the value is finite."""
  return cond.cond(
      math_ops.is_finite(value),
      lambda: _op_in_graph_mode(var.assign(value)),
      control_flow_ops.no_op)


@deprecation.deprecated_endpoints(
    'mixed_precision.experimental.DynamicLossScale',
    'train.experimental.DynamicLossScale')
@tf_export(
    v1=[
        'mixed_precision.DynamicLossScale',
        'mixed_precision.experimental.DynamicLossScale',
        'train.experimental.DynamicLossScale'
    ])
class DynamicLossScale(LossScale):
  """Loss scale that dynamically adjusts itself.

  Dynamic loss scaling works by adjusting the loss scale as training progresses.
  The goal is to keep the loss scale as high as possible without overflowing the
  gradients. As long as the gradients do not overflow, raising the loss scale
  never hurts.

  The algorithm starts by setting the loss scale to an initial value. Every N
  steps that the gradients are finite, the loss scale is increased by some
  factor. However, if a NaN or Inf gradient is found, the gradients for that
  step are not applied, and the loss scale is decreased by the factor. This
  process tends to keep the loss scale as high as possible without gradients
  overflowing.
  """

  @deprecation.deprecated(
      None, 'Use tf.keras.mixed_precision.LossScaleOptimizer instead. '
            'LossScaleOptimizer now has all the functionality of '
            'DynamicLossScale')
  def __init__(self,
               initial_loss_scale=2 ** 15,  # See docstring for why this is big.
               increment_period=2000,
               multiplier=2.):
    """Creates the dynamic loss scale.

    Args:
      initial_loss_scale: A Python float.  The loss scale to use at the
        beginning. It's better to start this at a very high number, because a
        loss scale that is too high gets lowered far more quickly than a loss
        scale that is too low gets raised. The default is 2 ** 15, which is
        approximately half the maximum float16 value.
      increment_period: Increases loss scale every `increment_period`
        consecutive steps that finite gradients are encountered. If a nonfinite
        gradient is encountered, the count is reset back to zero.
      multiplier: The multiplier to use when increasing or decreasing the loss
        scale.
    """
    super(DynamicLossScale, self).__init__()
    self._initial_loss_scale = float(initial_loss_scale)
    self._increment_period = int(increment_period)
    self._multiplier = float(multiplier)

    self._current_loss_scale = self._add_weight(
        name='current_loss_scale',
        dtype=dtypes.float32,
        initial_value=self._initial_loss_scale)
    # The number of consecutive steps with finite gradients since the loss
    # scale was last changed.
    self._num_good_steps = self._add_weight(
        name='good_steps', dtype=dtypes.int64, initial_value=0)

  @property
  def initial_loss_scale(self):
    return self._initial_loss_scale

  @property
  def increment_period(self):
    return self._increment_period

  @property
  def multiplier(self):
    return self._multiplier

  def __call__(self):
    return ops.convert_to_tensor(self._current_loss_scale)

  def update(self, grads):
    """Updates loss scale based on if gradients are finite in current step."""
    grads = nest.flatten(grads)
    if distribute_lib.has_strategy():
      distribution = distribute_lib.get_cross_replica_context()

      def get_is_finite(grads):
        is_finite = _is_all_finite(grads)
        # Cast to float, because booleans cannot be reduced with a
        # DistributionStrategy.
        return math_ops.cast(is_finite, dtypes.float32)

      is_finite_float = distribution.extended.call_for_each_replica(
          get_is_finite, args=(grads,))
      reduced_is_finite_float = distribution.reduce(
          reduce_util.ReduceOp.SUM, is_finite_float, axis=None)
      # Gradients are finite only if every replica reported finite gradients.
      is_finite = math_ops.equal(reduced_is_finite_float,
                                 distribution.num_replicas_in_sync)
    else:
      is_finite = _is_all_finite(grads)

    def update_if_finite_grads():
      """Update assuming the gradients are finite."""

      def incr_loss_scale():
        new_loss_scale = self._current_loss_scale * self._multiplier
        # Only assign the raised loss scale if it is still finite (it can
        # overflow to inf), then reset the good-step counter.
        return control_flow_ops.group(
            _assign_if_finite(self._current_loss_scale, new_loss_scale),
            self._num_good_steps.assign(0))

      return cond.cond(
          self._num_good_steps + 1 >= self._increment_period,
          incr_loss_scale,
          lambda: _op_in_graph_mode(self._num_good_steps.assign_add(1)))

    def update_if_not_finite_grads():
      """Update assuming the gradients are nonfinite."""

      new_loss_scale = math_ops.maximum(
          self._current_loss_scale / self._multiplier, 1)
      return control_flow_ops.group(
          self._num_good_steps.assign(0),
          self._current_loss_scale.assign(new_loss_scale))

    update_op = cond.cond(is_finite, update_if_finite_grads,
                          update_if_not_finite_grads)
    should_apply_gradients = is_finite
    return update_op, should_apply_gradients

  def __repr__(self):
    if context.executing_eagerly():
      return ('DynamicLossScale(current_loss_scale=%s, num_good_steps=%s, '
              'initial_loss_scale=%s, increment_period=%s, multiplier=%s)' %
              (self._current_loss_scale.numpy(), self._num_good_steps.numpy(),
               self.initial_loss_scale, self.increment_period,
               self.multiplier))
    else:
      return ('DynamicLossScale(initial_loss_scale=%s, increment_period=%s, '
              'multiplier=%s)' %
              (self.initial_loss_scale, self.increment_period,
               self.multiplier))

  def get_config(self):
    return {
        'initial_loss_scale': self.initial_loss_scale,
        'increment_period': self.increment_period,
        'multiplier': self.multiplier,
    }


def get(identifier):
  """Get a loss scale object."""
  if isinstance(identifier, (int, float)):
    return FixedLossScale(identifier)
  if identifier == 'dynamic':
    return DynamicLossScale()
  if isinstance(identifier, LossScale):
    return identifier
  elif identifier is None:
    return None
  else:
    raise ValueError('Could not interpret loss scale identifier: %s' %
                     identifier)