
"""Decorator to override the gradient for a function."""

from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.eager import record
from tensorflow.python.framework import composite_tensor_gradient
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import handle_data_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import op_selector
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest
from tensorflow.python.util import tf_decorator
from tensorflow.python.util import tf_inspect
from tensorflow.python.util import variable_utils
from tensorflow.python.util.tf_export import tf_export


VAR_OP_TYPES = [
    "VariableV2",
    "VarHandleOp",
]


@tf_export("custom_gradient")
def custom_gradient(f=None):
  """Decorator to define a function with a custom gradient.

  This decorator allows fine-grained control over the gradients of a sequence
  of operations. This may be useful for multiple reasons, including providing
  a more efficient or numerically stable gradient for a sequence of operations.

  For example, consider the following function that commonly occurs in the
  computation of cross entropy and log likelihoods:

  ```python
  def log1pexp(x):
    return tf.math.log(1 + tf.exp(x))
  ```

  Due to numerical instability, the gradient of this function evaluated at x=100
  is NaN.  For example:

  ```python
  x = tf.constant(100.)
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = log1pexp(x)
  dy_dx = tape.gradient(y, x)  # Will be NaN when evaluated.
  ```

  The gradient expression can be analytically simplified to provide numerical
  stability:

  ```python
  @tf.custom_gradient
  def log1pexp(x):
    e = tf.exp(x)
    def grad(upstream):
      return upstream * (1 - 1 / (1 + e))
    return tf.math.log(1 + e), grad
  ```

  With this definition, the gradient `dy_dx` at `x = 100` will be correctly
  evaluated as 1.0.
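
  For instance, a quick check of the stabilized version (a sketch reusing the
  `log1pexp` defined above, not a verified doctest):

  ```python
  x = tf.constant(100.)
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = log1pexp(x)
  print(tape.gradient(y, x))  # 1.0, rather than NaN
  ```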

  The variable `upstream` is defined as the upstream gradient, i.e. the gradient
  from all the layers or functions originating from this layer. The above
  example has no upstream functions, therefore `upstream = dy/dy = 1.0`.

  Assume that `x_i` is `log1pexp` in the forward pass `x_1 = x_1(x_0)`,
  `x_2 = x_2(x_1)`, ..., `x_i = x_i(x_i-1)`, ..., `x_n = x_n(x_n-1)`. By
  chain rule we know that `dx_n/dx_0 = dx_n/dx_n-1 * dx_n-1/dx_n-2 * ... *
  dx_i/dx_i-1 * ... * dx_1/dx_0`.

  In this case the gradient of our current function is
  `dx_i/dx_i-1 = (exp(x_i) / (1 + exp(x_i))) = (1 - 1 / (1 + exp(x_i)))`. The
  upstream gradient `upstream` would be `dx_n/dx_n-1 * dx_n-1/dx_n-2 * ... *
  dx_i+1/dx_i`. The upstream gradient multiplied by the current gradient is
  then passed downstream.

  If the function takes multiple inputs, the `grad`
  function must also return the same number of gradients, one for each input.
  We take the function `z = x * y` as an example.

  >>> @tf.custom_gradient
  ... def bar(x, y):
  ...   def grad(upstream):
  ...     dz_dx = y
  ...     dz_dy = x
  ...     return upstream * dz_dx, upstream * dz_dy
  ...   z = x * y
  ...   return z, grad
  >>> x = tf.constant(2.0, dtype=tf.float32)
  >>> y = tf.constant(3.0, dtype=tf.float32)
  >>> with tf.GradientTape(persistent=True) as tape:
  ...   tape.watch(x)
  ...   tape.watch(y)
  ...   z = bar(x, y)
  >>> z
  <tf.Tensor: shape=(), dtype=float32, numpy=6.0>
  >>> tape.gradient(z, x)
  <tf.Tensor: shape=(), dtype=float32, numpy=3.0>
  >>> tape.gradient(z, y)
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>

  Nesting custom gradients can lead to unintuitive results. The default
  behavior does not correspond to n-th order derivatives. For example:

  ```python
  @tf.custom_gradient
  def op(x):
    y = op1(x)
    @tf.custom_gradient
    def grad_fn(dy):
      gdy = op2(x, y, dy)
      def grad_grad_fn(ddy):  # Not the 2nd order gradient of op w.r.t. x.
        return op3(x, y, dy, ddy)
      return gdy, grad_grad_fn
    return y, grad_fn
  ```

  The function `grad_grad_fn` computes the first-order gradient
  of `grad_fn` with respect to `dy`, which is used to generate forward-mode
  gradient graphs from backward-mode gradient graphs, but it is not the same as
  the second-order gradient of `op` with respect to `x`.

  Instead, wrap nested `@tf.custom_gradients` in another function:

  ```python
  @tf.custom_gradient
  def op_with_fused_backprop(x):
    y, x_grad = fused_op(x)
    def first_order_gradient(dy):
      @tf.custom_gradient
      def first_order_custom(unused_x):
        def second_order_and_transpose(ddy):
          return second_order_for_x(...), gradient_wrt_dy(...)
        return x_grad, second_order_and_transpose
      return dy * first_order_custom(x)
    return y, first_order_gradient
  ```

  Additional arguments to the inner `@tf.custom_gradient`-decorated function
  control the expected return values of the innermost function.

  The examples above illustrate how to specify custom gradients for functions
  which do not read from variables. The following example uses variables, which
  require special handling because they are effectively inputs of the forward
  function.

  >>> weights = tf.Variable(tf.ones([2]))  # Trainable variable weights
  >>> @tf.custom_gradient
  ... def linear_poly(x):
  ...   # Creating polynomial
  ...   poly = weights[1] * x + weights[0]
  ...
  ...   def grad_fn(dpoly, variables):
  ...     # dy/dx = weights[1] and we need to left multiply dpoly
  ...     grad_xs = dpoly * weights[1]  # Scalar gradient
  ...
  ...     grad_vars = []  # To store gradients of passed variables
  ...     assert variables is not None
  ...     assert len(variables) == 1
  ...     assert variables[0] is weights
  ...     # Manually computing dy/dweights
  ...     dy_dw = dpoly * tf.stack([x ** 1, x ** 0])
  ...     grad_vars.append(
  ...         tf.reduce_sum(tf.reshape(dy_dw, [2, -1]), axis=1)
  ...     )
  ...     return grad_xs, grad_vars
  ...   return poly, grad_fn
  >>> x = tf.constant([1., 2., 3.])
  >>> with tf.GradientTape(persistent=True) as tape:
  ...   tape.watch(x)
  ...   poly = linear_poly(x)
  >>> poly # poly = x + 1
  <tf.Tensor: shape=(3,),
    dtype=float32,
    numpy=array([2., 3., 4.], dtype=float32)>
  >>> tape.gradient(poly, x)  # conventional scalar gradient dy/dx
  <tf.Tensor: shape=(3,),
    dtype=float32,
    numpy=array([1., 1., 1.], dtype=float32)>
  >>> tape.gradient(poly, weights)
  <tf.Tensor: shape=(2,), dtype=float32, numpy=array([6., 3.], dtype=float32)>

  The above example illustrates the use of the trainable variable `weights`.
  Here the inner `grad_fn` accepts an extra `variables` input
  parameter and also returns an extra `grad_vars` output. That extra argument
  is passed if the forward function reads any variables. You need to
  compute the gradient w.r.t. each of those `variables` and output it as a list
  of `grad_vars`. Note that `variables` defaults to `None` when no variables
  are used in the forward function.
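
  If the forward function does read a variable but `grad_fn` omits the
  `variables` argument, the decorator raises a `TypeError` stating that
  `grad_fn` must accept the keyword argument 'variables'. A minimal sketch of
  the mistake (reusing `weights` from above):

  ```python
  @tf.custom_gradient
  def broken_linear_poly(x):
    poly = weights[1] * x + weights[0]
    def grad_fn(dpoly):  # Missing `variables=None` even though `weights` is read.
      return dpoly * weights[1]
    return poly, grad_fn
  ```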

  Note that `tf.GradientTape` still watches the forward pass of a
  `tf.custom_gradient`, and will use the ops it watches. As a consequence,
  calling `tf.function` while the tape is still watching leads
  to a gradient graph being built. If an op is used in `tf.function` without a
  registered gradient, a `LookupError` will be raised.

  Users can insert `tf.stop_gradient` to customize this behavior. This
  is demonstrated in the example below. `tf.random.shuffle` does not have a
  registered gradient. As a result `tf.stop_gradient` is used to avoid the
  `LookupError`.

  ```python
  x = tf.constant([0.3, 0.5], dtype=tf.float32)

  @tf.custom_gradient
  def test_func_with_stop_grad(x):
    @tf.function
    def _inner_func():
      # Avoid exception during the forward pass
      return tf.stop_gradient(tf.random.shuffle(x))
      # return tf.random.shuffle(x)  # This will raise

    res = _inner_func()
    def grad(upstream):
      return upstream  # Arbitrarily defined custom gradient
    return res, grad

  with tf.GradientTape() as g:
    g.watch(x)
    res = test_func_with_stop_grad(x)

  g.gradient(res, x)
  ```

  See also `tf.RegisterGradient` which registers a gradient function for a
  primitive TensorFlow operation. `tf.custom_gradient` on the other hand allows
  for fine-grained control over the gradient computation of a sequence of
  operations.
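
  For instance, `tf.RegisterGradient` works at the level of a registered op
  type, typically together with `tf.Graph.gradient_override_map`. A rough
  sketch of that op-level approach (illustrative only; the clipping gradient
  is an arbitrary choice):

  ```python
  @tf.RegisterGradient("ClipGrad")
  def _clip_grad(unused_op, grad):
    return tf.clip_by_value(grad, -0.1, 0.1)

  with tf.Graph().as_default() as g:
    x = tf.constant([3.0])
    with g.gradient_override_map({"Identity": "ClipGrad"}):
      y = tf.identity(x)  # Gradients flowing through this op are now clipped.
  ```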

  Note that if the decorated function uses `Variable`s, the enclosing variable
  scope must be using
  [ResourceVariables](https://www.tensorflow.org/guide/migrate/tf1_vs_tf2#resourcevariables_instead_of_referencevariables).

  Args:
    f: function `f(*x)` that returns a tuple `(y, grad_fn)` where: - `x` is a
      sequence of (nested structures of) `Tensor` inputs to the function. - `y`
      is a (nested structure of) `Tensor` outputs of applying TensorFlow
      operations in `f` to `x`. - `grad_fn` is a function with the signature
      `g(*grad_ys)` which returns a list of `Tensor`s the same size as
      (flattened) `x` - the derivatives of `Tensor`s in `y` with respect to the
      `Tensor`s in `x`.  `grad_ys` is a sequence of `Tensor`s the same size as
      (flattened) `y` holding the initial value gradients for each `Tensor` in
      `y`.  In a pure mathematical sense, a vector-argument vector-valued
      function `f`'s derivatives should be its Jacobian matrix `J`. Here we are
      expressing the Jacobian `J` as a function `grad_fn` which defines how `J`
      will transform a vector `grad_ys` when left-multiplied with it (`grad_ys *
      J`, the vector-Jacobian product, or VJP). This functional representation
      of a matrix is convenient to use for chain-rule calculation (in e.g. the
      back-propagation algorithm).  If `f` uses `Variable`s (that are not part
      of the inputs), i.e. through `get_variable`, then `grad_fn` should have
      signature `g(*grad_ys, variables=None)`, where `variables` is a list of
      the `Variable`s, and return a 2-tuple `(grad_xs, grad_vars)`, where
      `grad_xs` is the same as above, and `grad_vars` is a `list<Tensor>` with
      the derivatives of `Tensor`s in `y` with respect to the variables (that
      is, `grad_vars` has one `Tensor` per variable in `variables`).

  Returns:
    A function `h(x)` which returns the same value as `f(x)[0]` and whose
    gradient (as calculated by `tf.gradients`) is determined by `f(x)[1]`.
  """
  if f is None:
    return lambda f: custom_gradient(f=f)

  @Bind.decorator
  def decorated(wrapped, args, kwargs):
    """Decorated function with custom gradient."""
    if context.executing_eagerly():
      return _eager_mode_decorator(wrapped, args, kwargs)
    else:
      return _graph_mode_decorator(wrapped, args, kwargs)

  return tf_decorator.make_decorator(f, decorated(f))  # pylint: disable=no-value-for-parameter


class Bind:
  """When called evaluates `d(f, args, kwargs)` but supports binding `f`.

  >>> @Bind.decorator
  ... def my_decorator(f, args, kwargs):
  ...   print("my_decorator called with", args, kwargs)
  ...   return f(*args, **kwargs)

  >>> class Foo:
  ...   @my_decorator
  ...   def bar(self, a, b, c):
  ...     return a * b * c

  >>> Foo.bar(None, 1, 2, c=3)
  my_decorator called with (None, 1, 2) {'c': 3}
  6

  >>> foo = Foo()
  >>> foo.bar(1, 2, c=3)
  my_decorator called with (1, 2) {'c': 3}
  6
  """

  @classmethod
  def decorator(cls, d):
    return lambda f: Bind(f, d)

  def __init__(self, f, d):
    self._f = f
    self._d = d

  def __get__(self, instance, owner):
    if instance is not None:
      f = self._f.__get__(instance, owner)
      return tf_decorator.make_decorator(f, Bind(f, self._d))
    return self

  def __call__(self, a, k):
    return self._d(self._f, a, k)


def get_variable_by_name(var_name):
  """Given a variable name, retrieves a handle on the tensorflow Variable."""

  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)

  def _filter_fn(item):
    try:
      return item.op.name == var_name
    except AttributeError:
      # Collection items without an operation are ignored.
      return False

  candidate_vars = list(filter(_filter_fn, global_vars))

  if len(candidate_vars) >= 1:
    # Filter out non-trainable variables.
    candidate_vars = [v for v in candidate_vars if v.trainable]
  else:
    raise ValueError("Unsuccessful at finding variable {}.".format(var_name))

  if len(candidate_vars) == 1:
    return candidate_vars[0]
  elif len(candidate_vars) > 1:
    raise ValueError(
        "Unsuccessful at finding trainable variable {}. "
        "Number of candidates: {}. "
        "Candidates: {}".format(var_name, len(candidate_vars), candidate_vars))
  else:
    # The variable is not trainable.
    return None


def _get_dependent_variables(input_ops, output_ops):
  """Finds variables involved in the subgraph between input_ops and output_ops.

  Args:
    input_ops: Flattened list of input ops
    output_ops: Flattened list of output ops

  Returns:
    A list of variables
  """
  # Avoids the edge case when input_ops == output_ops.
  output_ops = nest.map_structure(gen_array_ops.identity, output_ops)
  inbetween_ops = op_selector.get_backward_walk_ops(
      seed_ops=output_ops,
      stop_at_ts=input_ops,
      inclusive=False,
      only_differentiable=True)
  var_ops = (op for op in inbetween_ops if op.type in VAR_OP_TYPES)
  var_names = (op.name for op in var_ops)
  tf_vars = (get_variable_by_name(var_name) for var_name in var_names)
  tf_vars = [v for v in tf_vars if v is not None]
  return tf_vars


def generate_name():
  return "CustomGradient-%s" % ops.uid()


def _graph_mode_decorator(f, args, kwargs):
  """Implement custom gradient decorator for graph mode."""
  if kwargs:
    raise ValueError(
        "The custom_gradient decorator currently supports keywords "
        "arguments only when eager execution is enabled.")
  name = generate_name()
  args = variable_utils.convert_variables_to_tensors(args)
  args = nest.map_structure(ops.convert_to_tensor, args, expand_composites=True)

  # Checking global and local variables attempts to ensure that no non-resource
  # Variables are added to the graph.
  current_var_scope = variable_scope.get_variable_scope()
  before_vars = set([
      v.ref() for v in current_var_scope.global_variables() +
      current_var_scope.local_variables()
  ])
  with record.VariableWatcher() as variable_watcher:
    result, grad_fn = f(*args)

  flat_args = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(args))
  flat_result = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(result))
  flat_result_len = len(flat_result)

  after_vars = set([
      v.ref() for v in current_var_scope.global_variables() +
      current_var_scope.local_variables()
  ])
  new_vars = after_vars - before_vars
  new_vars_list = [v.deref() for v in new_vars]
  for v in new_vars_list:
    if not resource_variable_ops.is_resource_variable(v):
      raise TypeError(
          "All variables used by a function wrapped with @custom_gradient must "
          "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
          "with `use_resource=False`.")

  # The variables that grad_fn needs to return gradients for are the set of
  # variables used that are *not* part of the inputs.
  variables_in_tape = frozenset(
      [v.ref() for v in variable_watcher.watched_variables()])

  graphs = {getattr(o, "graph", None) for o in flat_result}
  # Not all results may be tensors. However, we want to ensure that all tensor
  # outputs come from the same graph and get a list of captured inputs for
  # variable search.
  graphs.discard(None)  # Discard non-graph outputs.
  if graphs:
    if len(graphs) > 1:
      raise ValueError(
          "All custom_gradient outputs should be from the same graph")
    output_graph = graphs.pop()
    filtered_input_tensors = []
    for i in flat_args:
      if i.graph == output_graph:
        filtered_input_tensors.append(i)
  else:
    filtered_input_tensors = flat_args

  variables_in_subgraph = frozenset([
      v.ref() for v in _get_dependent_variables(
          input_ops=filtered_input_tensors, output_ops=flat_result)
  ])
  variables = sorted(
      [v.deref() for v in variables_in_subgraph.union(variables_in_tape)],
      key=lambda v: v.name)

  grad_argspec = tf_inspect.getfullargspec(grad_fn)
  variables_in_signature = ("variables" in grad_argspec.args or
                            "variables" in grad_argspec.kwonlyargs or
                            grad_argspec.varkw)
  if variables and not variables_in_signature:
    raise TypeError(
        "@tf.custom_gradient grad_fn must accept keyword argument 'variables', "
        "since function uses variables: {}".format(variables))
  if variables_in_signature and not variables:
    # User seems to intend to pass variables but none were captured.
    logging.vlog(
        1, "@custom_gradient grad_fn has 'variables' in signature, but "
        "no ResourceVariables were used on the forward pass.")

  all_tensors = flat_result + flat_args + variables

  def tape_grad_fn(*result_grad_components):
    """Custom grad fn wrapper."""
    result_grads = composite_tensor_gradient.replace_flat_tensors_for_gradients(
        nest.flatten(result), result_grad_components[:flat_result_len])
    if not isinstance(result_grads, (list, tuple)):
      result_grads = [result_grads]

    if variables:
      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
      if len(variable_grads) != len(variables):
        raise ValueError("Must return gradient for each variable from "
                         "@custom_gradient grad_fn.")
    else:
      input_grads = grad_fn(*result_grads)
      variable_grads = []

    # Need to return one value per input to the IdentityN, so pad the
    # gradients of the inputs of the custom_gradient function with the
    # gradients of the outputs as well.
    input_grads = composite_tensor_gradient.get_flat_tensors_for_gradients(
        nest.flatten(input_grads))
    return ([None] * flat_result_len) + input_grads + variable_grads

  @ops.RegisterGradient(name)
  def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
    """Custom grad fn wrapper."""
    return tape_grad_fn(*result_grads)

  original_tensors = all_tensors
  with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
    all_tensors = array_ops.identity_n(all_tensors)

  original_tensors = [ops.convert_to_tensor(x) for x in original_tensors]

  # Propagate handle data for happier shape inference for resource variables.
  for i, t in enumerate(original_tensors):
    if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
      all_tensors[i]._handle_data = t._handle_data  # pylint: disable=protected-access
  record.record_operation(f.__name__, all_tensors, original_tensors,
                          tape_grad_fn)
  for ot, t in zip(original_tensors, all_tensors):
    handle_data_util.copy_handle_data(ot, t)
  flat_result = composite_tensor_gradient.replace_flat_tensors_for_gradients(
      nest.flatten(result), all_tensors[:flat_result_len])
  return nest.pack_sequence_as(result, flat_result)


def _eager_mode_decorator(f, args, kwargs):
  """Implement custom gradient decorator for eager mode."""
  with record.VariableWatcher() as variable_watcher:
    result, grad_fn = f(*args, **kwargs)
  flat_args = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(args))
  flat_kwargs = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(kwargs))
  all_inputs = flat_args + flat_kwargs
  # The variables that grad_fn needs to return gradients for are the set of
  # variables used that are *not* part of the inputs.
  variables = [
      v.deref()  # pylint: disable=g-complex-comprehension
      for v in set(v.ref() for v in variable_watcher.watched_variables())
      if all(v.deref() is not i for i in all_inputs)
  ]
  grad_argspec = tf_inspect.getfullargspec(grad_fn)
  if (variables and ("variables" not in grad_argspec.args) and
      ("variables" not in grad_argspec.kwonlyargs) and
      not grad_argspec.varkw):
    raise TypeError(
        "@tf.custom_gradient grad_fn must accept keyword argument 'variables', "
        "since function uses variables: {}".format(variables))
  flat_result = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(result))
  flat_result = [gen_array_ops.identity(x) for x in flat_result]

  input_tensors = [
      ops.convert_to_tensor(x) for x in flat_args + list(variables)
  ]

  recorded_inputs = input_tensors
  arg_count = len(flat_args)

  def actual_grad_fn(*result_grad_components):
    """Custom grad fn wrapper."""
    result_grads = composite_tensor_gradient.replace_flat_tensors_for_gradients(
        nest.flatten(result), result_grad_components)
    if not isinstance(result_grads, (list, tuple)):
      result_grads = [result_grads]

    if variables:
      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
      if len(variable_grads) != len(variables):
        raise ValueError("Must return gradient for each variable from "
                         "@custom_gradient grad_fn.")
    else:
      input_grads = grad_fn(*result_grads)
      variable_grads = []
    flat_grads = composite_tensor_gradient.get_flat_tensors_for_gradients(
        nest.flatten(input_grads))
    if len(flat_grads) != arg_count:
      raise ValueError(
          f"custom_gradient function expected to return {arg_count} "
          f"gradients, but returned {len(flat_grads)} instead.")
    return flat_grads + variable_grads

  record.record_operation(f.__name__, flat_result, recorded_inputs,
                          actual_grad_fn)
  flat_result = composite_tensor_gradient.replace_flat_tensors_for_gradients(
      nest.flatten(result), flat_result)
  return nest.pack_sequence_as(result, flat_result)


@tf_export("recompute_grad")
def recompute_grad(f):
  """
  Defines a function as a recompute-checkpoint for the tape auto-diff.

  Tape checkpointing is a technique to reduce the memory consumption of the
  auto-diff tape:

  - Without tape checkpointing operations and intermediate values are
  recorded to the tape for use in the backward pass.

  - With tape checkpointing, only the function call and its inputs are
  recorded. During back-propagation the `recompute_grad` custom gradient
  (`tf.custom_gradient`) recomputes the function under a localized Tape object.
  This recomputation of the function during backpropagation performs redundant
  calculation, but reduces the overall memory usage of the Tape.

  >>> y = tf.Variable(1.0)

  >>> def my_function(x):
  ...   tf.print('running')
  ...   z = x*y
  ...   return z

  >>> my_function_recompute = tf.recompute_grad(my_function)

  >>> with tf.GradientTape() as tape:
  ...   r = tf.constant(1.0)
  ...   for i in range(4):
  ...     r = my_function_recompute(r)
  running
  running
  running
  running

  >>> grad = tape.gradient(r, [y])
  running
  running
  running
  running

  Without `recompute_grad`, the tape contains all intermediate steps, and no
  recomputation is performed.

  >>> with tf.GradientTape() as tape:
  ...   r = tf.constant(1.0)
  ...   for i in range(4):
  ...     r = my_function(r)
  running
  running
  running
  running

  >>> grad = tape.gradient(r, [y])


  If `f` is a `tf.keras` `Model` or `Layer` object, methods and attributes
  such as `f.variables` are not available on the returned function `g`.
  Either keep a reference to `f`, or use `g.__wrapped__` to access
  these variables and methods.


  >>> def print_running_and_return(x):
  ...   tf.print("running")
  ...   return x

  >>> model = tf.keras.Sequential([
  ...   tf.keras.layers.Lambda(print_running_and_return),
  ...   tf.keras.layers.Dense(2)
  ... ])

  >>> model_recompute = tf.recompute_grad(model)

  >>> with tf.GradientTape(persistent=True) as tape:
  ...   r = tf.constant([[1,2]])
  ...   for i in range(4):
  ...     r = model_recompute(r)
  running
  running
  running
  running

  >>> grad = tape.gradient(r, model.variables)
  running
  running
  running
  running

  Alternatively, use the `__wrapped__` attribute to access the original
  model object.

  >>> grad = tape.gradient(r, model_recompute.__wrapped__.variables)
  running
  running
  running
  running


  Args:
    f: function `f(*x)` that returns a `Tensor` or sequence of `Tensor` outputs.

  Returns:
    A function `g` wrapping `f` that defines a custom gradient, which recomputes
    `f` on the backwards pass of a gradient call.
  """

  @custom_gradient
  def inner(*args, **kwargs):
    """Inner function closure for calculating gradients."""
    current_var_scope = variable_scope.get_variable_scope()
    with record.stop_recording():
      result = f(*args, **kwargs)

    def grad_wrapper(*wrapper_args, variables=None):
      """Wrapper function to accommodate lack of kwargs in graph mode custom_gradient."""

      @custom_gradient
      def inner_recompute_grad(*dresult):
        """Nested custom gradient function for computing grads in reverse and forward mode autodiff."""
        # Gradient calculation for reverse mode autodiff.
        with backprop.GradientTape() as t:
          id_args = nest.map_structure(gen_array_ops.identity, args)
          # Tuple `dresult` should contain at least one tensor.
          assert len(dresult) >= 1

          if not context.executing_eagerly():
            # XLA doesn't respect `tf.control_dependencies`. The code block
            # below manually adds a data dependency to `dresult` so that
            # recomputation of `f(*args, **kwargs)` happens after `dresult`.

            # This works even if `dresult[0]` is a size 0 tensor, as reduce_max
            # of a size 0 tensor returns -inf. Use reshape here to avoid
            # reading the entire `dresult[0]`.
            elem = math_ops.reduce_max(array_ops.reshape(dresult[0], [-1])[:1])
            # Cast elem to bool in case elem is NaN.
            elem_bool = math_ops.cast(elem, dtypes.bool)
            dresult_dep = array_ops.where_v2(
                elem_bool == elem_bool, 0., float("nan"))  # pylint: disable=comparison-with-itself
            id_args = nest.map_structure(
                lambda x: x + math_ops.cast(dresult_dep, x.dtype), id_args)

          t.watch(id_args)
          if variables is not None:
            t.watch(variables)
          with ops.control_dependencies(dresult):
            result = f(*id_args, **kwargs)
        kw_vars = []
        if variables is not None:
          kw_vars = list(variables)
        grads = t.gradient(
            result,
            list(id_args) + kw_vars,
            output_gradients=dresult,
            unconnected_gradients=UnconnectedGradients.ZERO)

        def transpose(*t_args, **t_kwargs):
          """Gradient function calculation for forward mode autodiff."""
          # Just throw an error since gradients / activations are not stored
          # on tape for recompute.
          raise NotImplementedError(
              "recompute_grad tried to transpose grad of {}. "
              "Consider not using recompute_grad in forward mode "
              "autodiff".format(f.__name__))

        return (grads[:len(id_args)], grads[len(id_args):]), transpose

      return inner_recompute_grad(*wrapper_args)

    return result, grad_wrapper

  return tf_decorator.make_decorator(f, inner)


@tf_export("grad_pass_through")
def grad_pass_through(f):
  """Creates a grad-pass-through op with the forward behavior provided in f.

  Use this function to wrap any op, maintaining its behavior in the forward
  pass, but replacing the original op in the backward graph with an identity.
  For example:

  ```python
  x = tf.Variable(1.0, name="x")
  z = tf.Variable(3.0, name="z")

  with tf.GradientTape() as tape:
    # y will evaluate to 9.0
    y = tf.grad_pass_through(x.assign)(z**2)
  # grads will evaluate to 6.0
  grads = tape.gradient(y, z)
  ```

  Another example is a 'differentiable' moving average approximation, where
  gradients are allowed to flow into the last value fed to the moving average,
  but the moving average is still used for the forward pass:

  ```python
  x = ... # Some scalar value
  # A moving average object, we don't need to know how this is implemented
  moving_average = MovingAverage()
  with backprop.GradientTape() as tape:
    # mavg_x will evaluate to the current running average value
    mavg_x = tf.grad_pass_through(moving_average)(x)
  grads = tape.gradient(mavg_x, x) # grads will evaluate to 1.0
  ```

  Args:
    f: function `f(*x)` that returns a `Tensor` or nested structure of `Tensor`
      outputs.

  Returns:
    A function `h(x)` which returns the same values as `f(x)` and whose
    gradients are the same as those of an identity function.
  """
  @custom_gradient
  def _grad_pass_through_op(*args, **kwargs):
    def grad(*args, **kwargs):
      variables = kwargs.get("variables")
      if variables is not None:
        # Variables involved in the wrapped op will not receive gradients.
        return args, [None] * len(variables)
      return args
    return f(*args, **kwargs), grad
  return tf_decorator.make_decorator(f, _grad_pass_through_op)