
"""Implementation of Neural Net (NN) functions."""

import math

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import device_context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import candidate_sampling_ops
from tensorflow.python.ops import cond as tf_cond
from tensorflow.python.ops import ctc_ops
from tensorflow.python.ops import custom_gradient
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gen_sparse_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_fused_batch_norm_grad  # pylint: disable=unused-import
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variables
from tensorflow.python.util import dispatch
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.deprecation import deprecated_argument_lookup
from tensorflow.python.util.tf_export import tf_export


@tf_export("nn.log_poisson_loss")
@dispatch.add_dispatch_support
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().assert_is_compatible_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "`log_input` and `targets` must have the same shape, received "
          f"({log_input.get_shape()} vs {targets.get_shape()}).")

    result = math_ops.exp(log_input) - log_input * targets
    if compute_full_loss:
      # Need to create constant tensors here so that their dtypes can be
      # matched to that of the targets.
      point_five = constant_op.constant(0.5, dtype=targets.dtype)
      two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)

      stirling_approx = (targets * math_ops.log(targets)) - targets + (
          point_five * math_ops.log(two_pi * targets))
      zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
      ones = array_ops.ones_like(targets, dtype=targets.dtype)
      cond = math_ops.logical_and(targets >= zeros, targets <= ones)
      result += array_ops.where(cond, zeros, stirling_approx)
    return result


@tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
    labels=None, logits=None, name=None):
  """See sigmoid_cross_entropy_with_logits_v2."""
  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", labels, logits)  # pylint: disable=protected-access

  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError(
          "`logits` and `labels` must have the same shape, received "
          f"({logits.get_shape()} vs {labels.get_shape()}).")

    # The logistic loss formula from above is
    #   x - x * z + log(1 + exp(-x))
    # For x < 0, a more numerically stable formula is
    #   -x * z + log(1 + exp(x))
    # Note that these two expressions can be combined into the following:
    #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    cond = (logits >= zeros)
    relu_logits = array_ops.where(cond, logits, zeros)
    neg_abs_logits = array_ops.where(cond, -logits, logits)
    return math_ops.add(
        relu_logits - logits * labels,
        math_ops.log1p(math_ops.exp(neg_abs_logits)),
        name=name)


@tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits_v2(  # pylint: disable=invalid-name
    labels=None, logits=None, name=None):
  """Computes sigmoid cross entropy given `logits`.

  Measures the probability error in tasks with two outcomes in which each
  outcome is independent and need not have a fully certain label. For instance,
  one could perform a regression where the probability of an event happening is
  known and used as a label. This loss may also be used for binary
  classification, where labels are either zero or one.

  For brevity, let `x = logits`, `z = labels`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `labels` must have the same type and shape.

  >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
  >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=labels, logits=logits).numpy()
  array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
         0.6931472], dtype=float32)

  Compared to the losses which handle multiple outcomes,
  `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
  classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
  efficient multi-class classification with hard labels,
  `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
  classification:

        sigmoid(x) = softmax([x, 0])[0]

  $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$

  While `sigmoid_cross_entropy_with_logits` works for soft binary labels
  (probabilities between 0 and 1), it can also be used for binary classification
  where the labels are hard. There is an equivalence between all three symbols
  in this case, with a probability 0 indicating the second class or 1 indicating
  the first class:

  >>> sigmoid_logits = tf.constant([1., -1., 0.])
  >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
  ...                           axis=-1)
  >>> soft_binary_labels = tf.constant([1., 1., 0.])
  >>> soft_multiclass_labels = tf.stack(
  ...     [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
  >>> hard_labels = tf.constant([0, 0, 1])
  >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
  ...     labels=hard_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
  >>> tf.nn.softmax_cross_entropy_with_logits(
  ...     labels=soft_multiclass_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=soft_binary_labels, logits=sigmoid_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
      inclusive.
    logits: A `Tensor` of type `float32` or `float64`. Any real number.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  return sigmoid_cross_entropy_with_logits(
      logits=logits, labels=labels, name=name)


sigmoid_cross_entropy_with_logits.__doc__ = (
    sigmoid_cross_entropy_with_logits_v2.__doc__)


@tf_export("nn.weighted_cross_entropy_with_logits", v1=[])
@dispatch.add_dispatch_support
def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight,
                                           name=None):
  """Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  >>> labels = tf.constant([1., 0.5, 0.])
  >>> logits = tf.constant([1.5, -0.1, -10.])
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
  array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
  array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`, with values
      between 0 and 1 inclusive.
    logits: A `Tensor` of type `float32` or `float64`, any real numbers.
    pos_weight: A coefficient to use on the positive examples, typically a
      scalar but otherwise broadcastable to the shape of `logits`. Its value
      should be non-negative.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError(
          "`logits` and `labels` must have the same shape, received "
          f"({logits.get_shape()} vs {labels.get_shape()}).")

    # The logistic loss formula from above is
    #   (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
    # To ensure stability and avoid overflow, the implementation uses
    #   (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
    log_weight = 1 + (pos_weight - 1) * labels
    return math_ops.add(
        (1 - labels) * logits,
        log_weight * (math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) +
                      nn_ops.relu(-logits)),
        name=name)
-C=7A<D8=8D55D88Ez)targets is deprecated, use labels insteadr   c                 :    t        d| d|      } t        | |||      S )a  Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  Args:
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.
    pos_weight: A coefficient to use on the positive examples.
    name: A name for the operation (optional).
    targets: Deprecated alias for labels.

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  labels = deprecated_argument_lookup("labels", labels, "targets", targets)
  return weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, name)


@tf_export("nn.relu_layer")
@dispatch.add_dispatch_support
def relu_layer(x, weights, biases, name=None):
  """Computes Relu(x * weight + biases).

  Args:
    x: a 2D tensor.  Dimensions typically: batch, in_units
    weights: a 2D tensor.  Dimensions typically: in_units, out_units
    biases: a 1D tensor.  Dimensions: out_units
    name: A name for the operation (optional).  If not specified
      "nn_relu_layer" is used.

  Returns:
    A 2-D Tensor computing relu(matmul(x, weights) + biases).
    Dimensions typically: batch, out_units.
  """
  with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name:
    x = ops.convert_to_tensor(x, name="x")
    weights = ops.convert_to_tensor(weights, name="weights")
    biases = ops.convert_to_tensor(biases, name="biases")
    xw_plus_b = nn_ops.bias_add(math_ops.matmul(x, weights), biases)
    return nn_ops.relu(xw_plus_b, name=name)
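

# Illustrative sketch (an addition for exposition, not part of the original
# module): relu_layer fuses matmul, bias_add and relu; the hypothetical
# helper below spells out the equivalent composition.
def _example_relu_layer_decomposition():
  x = constant_op.constant([[1.0, -2.0]])
  weights = constant_op.constant([[3.0], [4.0]])
  biases = constant_op.constant([0.5])
  fused = relu_layer(x, weights, biases)
  manual = nn_ops.relu(nn_ops.bias_add(math_ops.matmul(x, weights), biases))
  return fused, manual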


@tf_export("nn.silu", "nn.swish")
@dispatch.add_dispatch_support
def swish(features, beta=1.0):
  """Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.

  beta : Hyperparameter for Swish activation function. Default value 1.0.

  The SiLU activation function was introduced in "Gaussian Error Linear Units
  (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
  "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
  Reinforcement Learning"
  [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
  discovered (and called swish) in "Searching for Activation Functions"
  [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)

  Args:
    features: A `Tensor` representing preactivation values.
    beta: A 'Tensor' representing value of beta hyperparameter.

  Returns:
    The activation value.
  """
  features = ops.convert_to_tensor(features, name="features")
  beta = ops.convert_to_tensor(beta, name="beta")
  beta = math_ops.cast(beta, features.dtype)

  @custom_gradient.custom_gradient
  def swish_impl(features, beta):

    def grad(dy):
      """Gradient for the Swish activation function."""
      # Recompute sigmoid(beta * features) in the backward pass instead of
      # keeping the forward-pass value alive, trading compute for memory.
      with ops.control_dependencies([dy]):
        sigmoid_features = math_ops.sigmoid(beta * features)
      activation_grad = (
          sigmoid_features * (1.0 + (beta * features) *
                              (1.0 - sigmoid_features)))
      beta_grad = math_ops.reduce_sum(
          dy * math_ops.square(features) * sigmoid_features *
          (1.0 - sigmoid_features))
      return (dy * activation_grad, beta_grad)

    return features * math_ops.sigmoid(beta * features), grad

  return swish_impl(features, beta)
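

# Illustrative sketch (an addition for exposition, not part of the original
# module): the closed-form activation gradient used above is
# sigmoid(b*x) * (1 + b*x * (1 - sigmoid(b*x))). The hypothetical helper
# below evaluates the Swish value and that manual gradient for one point.
def _example_swish_value_and_manual_grad():
  x = constant_op.constant(0.5)
  b = constant_op.constant(1.0)
  s = math_ops.sigmoid(b * x)
  value = x * s
  manual_grad = s * (1.0 + (b * x) * (1.0 - s))
  return value, manual_grad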


@tf_export("linalg.normalize")
@dispatch.add_dispatch_support
def normalize(tensor, ord="euclidean", axis=None, name=None):
  """Normalizes `tensor` along dimension `axis` using specified norm.

  This uses `tf.linalg.norm` to compute the norm along `axis`.

  This function can compute several different vector norms (the 1-norm, the
  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).

  Args:
    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
    ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
      `2`, `np.inf` and any positive real number yielding the corresponding
      p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
      `tensor` is a matrix and equivalent to 2-norm for vectors.
      Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
        vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
        '`fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
        on how to compute norms for a batch of vectors or matrices stored in a
        tensor.
    axis: If `axis` is `None` (the default), the input is considered a vector
      and a single vector norm is computed over the entire set of values in the
      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
      `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
      input is considered a batch of vectors, and `axis` determines the axis in
      `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
      Python integers it is considered a batch of matrices and `axis` determines
      the axes in `tensor` over which to compute a matrix norm.
      Negative indices are supported. Example: If you are passing a tensor that
        can be either a matrix or a batch of matrices at runtime, pass
        `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
        computed.
    name: The name of the op.

  Returns:
    normalized: A normalized `Tensor` with the same shape as `tensor`.
    norm: The computed norms with the same shape and dtype `tensor` but the
      final axis is 1 instead. Same as running
      `tf.cast(tf.linalg.norm(tensor, ord, axis keepdims=True), tensor.dtype)`.

  Raises:
    ValueError: If `ord` or `axis` is invalid.
  """
  with ops.name_scope(name, "normalize", [tensor]) as name:
    tensor = ops.convert_to_tensor(tensor)
    norm = linalg_ops.norm(tensor, ord, axis, keepdims=True)
    norm = math_ops.cast(norm, tensor.dtype)
    normalized = tensor / norm
    return normalized, norm


@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize")
@dispatch.add_dispatch_support
@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
  """Normalizes along dimension `axis` using an L2 norm.

  For a 1-D tensor with `axis = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `axis`.

  1-D tensor example:
  >>> x = tf.constant([3.0, 4.0])
  >>> tf.math.l2_normalize(x).numpy()
  array([0.6, 0.8], dtype=float32)

  2-D tensor example:
  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 0).numpy()
  array([[0.6],
       [0.8]], dtype=float32)

  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 1).numpy()
  array([[1.],
       [1.]], dtype=float32)

  Args:
    x: A `Tensor`.
    axis: Dimension along which to normalize.  A scalar or a vector of
      integers.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).
    dim: Deprecated, do not use.

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
  with ops.name_scope(name, "l2_normalize", [x]) as name:
    x = ops.convert_to_tensor(x, name="x")
    if x.dtype.is_complex:
      square_real = math_ops.square(math_ops.real(x))
      square_imag = math_ops.square(math_ops.imag(x))
      square_sum = math_ops.real(
          math_ops.reduce_sum(square_real + square_imag, axis, keepdims=True))
      x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
      norm_real = math_ops.multiply(math_ops.real(x), x_inv_norm)
      norm_imag = math_ops.multiply(math_ops.imag(x), x_inv_norm)
      return math_ops.complex(norm_real, norm_imag, name=name)
    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
    x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
    return math_ops.multiply(x, x_inv_norm, name=name)


def _count_nonzero(input_tensor, dtype=dtypes.int64):
  """Same as math_ops.count_nonzero.
  The reduction is done in dtype, which can be faster for 32-bit dtypes.

  Args:
      input_tensor: numeric tensor
      dtype: reduction dtype

  Returns:
      number of nonzero values with type dtype
  count_nonzero)valuesr!   nonzero_countr   N)	r   r$   r   r7   r"   r   rc   rk   	not_equal)input_tensorr"   zeror   s       r9   _count_nonzeror   W  sv     ~~o|n= ??2\%7%78D''|T2	.0M   s   A#BBzmath.zero_fractionznn.zero_fractionc                 r    t        j                  |d g      5  t        j                   d       t        j                   t
        j                        }t        j                  |t
        j                  j                  k   fd fd      }t        j                  d      5  ||z
  }t        j                  |t
        j                  	      }t        j                  |t
        j                  	      }||z  }d
d
d
       t        j                  d      cd
d
d
       S # 1 sw Y   (xY w# 1 sw Y   y
xY w)a  Returns the fraction of zeros in `value`.

  If `value` is empty, the result is `nan`.

  This is useful in summaries to measure and report sparsity.  For example,

  ```python
      z = tf.nn.relu(...)
      summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  zero_fractionvaluer   )out_typec                      t        j                  t         t        j                        t        j
                        S Nr!   )r   rk   r   r   int32int64r   s   r9   <lambda>zzero_fraction.<locals>.<lambda>  s%    55,,  rI   c                  :    t         t        j                        S r   )r   r   r   r   s   r9   r   zzero_fraction.<locals>.<lambda>  s    V\\B rI   )true_fnfalse_fncounts_to_fractionr!   Nfraction)r   r$   r%   r   sizer   r   tf_condr	   r   maxr   rk   float32identity)r   r   r   num_nonzeronum_zeronum_zero_float32size_float32zero_fraction_float32s   `       r9   r   r   l  s    * ~~dOeW5 A!!%g6E>>%&,,7D,,     CDK 
,	- >#h!xv~~F]]4v~~>l.=	> 3Z@%A A> >A As%   BD-%AD!:D-!D*	&D--D6znn.depthwise_conv2dc           
         t        d|d|      }t        j                  d| g      5 t        j                  | d      } t        j                  d      |ddg}t	        j
                         Cd	k(  rdd|d
   |d   g}nd|d
   |d   dg}t        j                  | ||      cddd       S fd}t        j                  | t        j                        |||      cddd       S # 1 sw Y   yxY w)ah  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                           strides[2] * j + rate[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
  ...                                 ).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  	dilationsrate	depthwise	tensor_inr   	filter_inNrK   NCHWr   inputfilterstridespaddingdata_formatr   r   c                 :    t        j                  | |      S )Nr   r   r   r   r   r   r   depthwise_conv2d_native)input_converted_r   r   r   r   r   s      r9   opzdepthwise_conv2d.<locals>.op  s(    ++! rI   r   filter_shapedilation_rater   r   r   )r   r   r$   r%   r   enclosing_tpu_contextr   r   with_space_to_batchr   shape)	r   r   r   r   r   r   r   r   r   s	    ``  ``  r9   depthwise_conv2dr     s   r 
$KFD	I$
~~dK%9 $T!!%k:E""6<F|Vd ++-9		47DG,	Qa!,	++!$ $* %%__V,=$ $ $s   BC785C77D c           	      &    t        | ||||||      S )a  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] =
          sum_{di, dj} filter[di, dj, k, q] *
                       input[b, strides[1] * i + dilations[0] * di,
                                strides[2] * j + dilations[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `dilations` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. See
      [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
      for more information. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  )r   r   r   r   r   r   r   )r   r   s          r9   depthwise_conv2d_v2r     s&    r 
!'")")(#&1
3 3rI   znn.separable_conv2dc	           	      t   t        d|d|      }t        j                  |d| |g      5 }t        j                  | d      } t        j                  d      t        j                  |d      }|j	                         j                  d      }	|	j                  d	   j                  d
       |	j                  d
   j                  d
       |d
d
g}fd}
t        j                  | t        j                        |||
      }t        j                  ||g dd|      cddd       S # 1 sw Y   yxY w)a
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
      Contains `in_channels` convolutional filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape
      `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
      filter to mix channels after `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for
      each dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  r   r   separable_conv2dr   r   depthwise_filterpointwise_filter   r   rK   Nc                 :    t        j                  | |d      S )Nr   r   r   )r   r   r   r   r   r   s      r9   r   zseparable_conv2d.<locals>.op  s(    ++!! rI   r   )rK   rK   rK   rK   VALID)r   r   r   )r   r   r$   r%   r&   	with_rankdimsr'   r   r   r   r   conv2d)r   r   r   r   r   r   r   r   r   pointwise_filter_shaper   r   s    ` `   `    r9   r   r   }  s;   ~ 
$KFD	I$
~~d..0@AC )FJ!!%k:E,,13,,13 .779CCAF"<<Q?"<<Q?|Vd **__%56I ==,I) ) )s   C8D..D7c           
      (    t        | |||||||      S )a
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
      filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
      in_channels, out_channels]`.  Pointwise filter to mix channels after
      `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  )r   r   r   )r   )r   r   r   r   r   r   r   r   s           r9   separable_conv2d_v2r     s*    ~ 

 rI   znn.sufficient_statisticsc                    t        t        |            }t        d|d|      }|d}t        j                  |d| |g      5  t        j
                  | d      } | j                         j                  [t        fd|D              rGd	}|D ]  }|j                  |   j                  z  }  t        j                  || j                  
      }nt        j                  |       }|D 	cg c]  }	|	dk  r|	|z   n|	 }
}	t        j                  t!        j"                  t        j$                  |       | j                        |
      }t!        j&                  |d      }|Dt        j
                  |d      }t!        j(                  | |      }t!        j*                  | |      }n| }t!        j,                  |       }t!        j.                  |||d      }t!        j.                  |||d      }ddd       |fS c c}	w # 1 sw Y   xY w)a8  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  For example:
  >>> t = [[1, 2, 3], [4, 5, 6]]
  >>> sufficient_statistics(t, [1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
  >>> sufficient_statistics(t, [-1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance. As in
      Python, the axes can also be negative numbers. A negative axis is
      interpreted as counting from the end of the rank, i.e., axis +
      rank(values)-th dimension.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.
    keepdims: Alias for keep_dims.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  rp   	keep_dimsNFsufficient_statisticsrU   r   c              3   T   K   | ]  }j                   |   j                  d u ! y wN)r   r   ).0dx_shapes     r9   	<genexpr>z(sufficient_statistics.<locals>.<genexpr>g  s)      (9./QT)(9s   %(rK   r!   r   countshiftmean_ssrp   r   var_ss)listsetr   r   r$   r%   r&   rankallr   r   r   r*   r"   r   gatherr   rk   r   reduce_prodsubtractsquared_differencerd   rc   )rU   axesr   r   r   rp   countsr   r   rt   positive_axesx_dimsm_ssv_ssr   s                 @r9   r   r   5  s   T 
c$i$((K4)I
~~d3aZ@ Nac*AkkmG||C (937(9 %9f (!',,q/'''(##F!'':f ^^AdEIJTdQhtd{D8JmJ
--	*AGG
4mEf##F9f##E8eq%(d((E2dd__QdtTIINDtTIHMD1N2 
tU	"" KN Ns    B(G>)G9<C/G>9G>>Hc                 "    t        | ||||      S )aJ  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keepdims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  )rU   r   r   r   r   )r   rU   r   r   rp   r   s        r9   sufficient_statistics_v2r     s    4 
	EXD
B BrI   znn.normalize_momentsc                    t        j                  |d| |||g      5  t        j                  | d      }|1t        j                  ||d      }t        j
                  ||d      }nt        j                  ||d      }|}t        j                  t        j                  ||      t        j                  |      d      }ddd       ||fS # 1 sw Y   fS xY w)a  Calculate the mean and variance of based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  rn   divisorr   Nshifted_meanmeanvariance)r   r$   r   
reciprocalr   rB   r   rd   )	r   r   variance_ssr   r   r   r   r   r   s	            r9   normalize_momentsr     s    $ ~~dK&';)NO !!&y9G&&wnMl\\,F;d&&wfEld  +w/%H 	 	s   B$CCz
nn.momentsc           	         t        d|d|      }|d}t        j                  |d| |g      5  | j                  t        j
                  k(  r$t        j                  | t        j                        n| }t        j                  ||dd      }t        j                  t        j                  |t        j                  |            |dd	      }|s,t        j                  ||      }t        j                  ||      }| j                  t        j
                  k(  rQt        j                  |t        j
                        t        j                  |t        j
                        fcddd       S ||fcddd       S # 1 sw Y   yxY w)
a  Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.
    keepdims: Alias to keep_dims.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  rp   r   NFmomentsTr   r   r   )r   r   r$   r"   r   float16r   rk   r   reduce_meanr   r   stop_gradientsqueeze)	rU   r   r   r   r   rp   yr   r   s	            r9   r   r     s5   H )(K4)I
~~dI4y1  -.GGv~~,Ea(1A4$VDD
 ####Ay'>'>t'DE	H
 tT*d""8T2hww&.. mmD&..1mmHfnn57) . H/  s   D/E2%E22E;c                 "    t        | ||||      S )a  Calculates the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation.
    keepdims: produce moments with the same dimensionality as the input.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  )rU   r   r   r   r   )r   r   s        r9   
moments_v2r    s    D 
14u48	LLrI   znn.weighted_momentsc                 (   t        d|d|      }|d}t        j                  |d| ||g      5  t        j                  | d      } t        j                  |d      }| j                  t
        j                  k(  }|r$t        j                  | t
        j                        } |j                  | j                  k7  r t        j                  || j                        }t        j                  || z  |d	d
      }|t        j                  |       z   }t        j                  ||dd
      }	t        j                  ||	      }
t        j                  |t        j                  | |
      z  |dd
      }t        j                  ||	      }|s.t        j                  |
|      }
t        j                  ||      }|rHt        j                  |
t
        j                        }
t        j                  |t
        j                        }|
|fcddd       S # 1 sw Y   yxY w)a  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.
    keepdims: Alias of keep_dims.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  rp   r   NFweighted_momentsrU   r   frequency_weightsweighted_input_sumT)r   rp   sum_of_weightsweighted_distsq)rt   )r   r   r$   r%   r"   r   r   r   rk   r   rc   r   r.   
div_no_nanr   r  )rU   r   r  r   r   rp   
needs_castr  broadcasted_weightsr	  weighted_meanr
  weighted_variances                r9   r  r  &  s   & )(K4)I
~~d.4Et0LM 3,ac*A-- 35 FNN*J
--6>>
*a!'')"--(9177C ",,At*>O ,i.B.B1.EE((T(84IN ''(:NKM ))H77=II	O !++O^L''DAm#++
$( mmM6>>Bm"--(96>>J++g3, 3, 3,s   GHHc                 "    t        | ||||      S )a  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    keepdims: Produce moments with the same dimensionality as the input.
    name: Name used to scope the operation.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  )rU   r   r  r   r   )r  )rU   r   r  rp   r   s        r9   weighted_moments_v2r  s  s     " 
	)
 rI   znn.batch_normalizationc           	      H   t        j                  |d| ||||g      5  t        j                  ||z         }|||z  }| t        j                  || j
                        z  t        j                  ||||z  z
  n| |z  | j
                        z   cddd       S # 1 sw Y   yxY w)a	  Batch normalization.

  Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
  `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
  shapes:

    * In all generality, they can have the same number of dimensions as the
      input `x`, with identical sizes as `x` for the dimensions that are not
      normalized over (the 'depth' dimension(s)), and dimension 1 for the
      others which are being normalized over.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=True)` during training, or running averages
      thereof during inference.
    * In the common case where the 'depth' dimension is the last dimension in
      the input tensor `x`, they may be one dimensional tensors of the same
      size as the 'depth' dimension.
      This is the case for example for the common `[batch, depth]` layout of
      fully-connected layers, and `[batch, height, width, depth]` for
      convolutions.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=False)` during training, or running averages
      thereof during inference.

  See equation 11 in Algorithm 2 of source:
  [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor.
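
  For example, a minimal sketch (shapes chosen arbitrarily):

  ```python
  x = tf.random.normal([8, 10])                # [batch, depth]
  mean, variance = tf.nn.moments(x, axes=[0])  # statistics over the batch
  offset = tf.zeros([10])                      # beta
  scale = tf.ones([10])                        # gamma
  y = tf.nn.batch_normalization(x, mean, variance, offset, scale,
                                variance_epsilon=1e-3)
  ```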

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  	batchnormN)r   r$   r   r   rk   r"   )rU   r   r   offsetscalevariance_epsilonr   invs           r9   batch_normalizationr    s    z ~~dK!T8UF)KL M
..$44
5C	Ulc x}}S!''**X]]%1us{AGG.M MM M Ms   A1BB!znn.fused_batch_normc
                 v   |r|	dk7  r||t        d|d|      t        j                  | d      } t        j                  |d      }t        j                  |d      }|t        j                  g       }|t        j                  g       }t        j                  | ||||||	|||
      \  }
}}}}}|
||fS )	a  Batch normalization.


  See Source: [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of 4 or 5 dimensions.
    scale: A `Tensor` of 1 dimension for scaling.
    offset: A `Tensor` of 1 dimension for bias.
    mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
          of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Mean must be a `Tensor` of the same shape as scale containing the
            estimated population mean computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Mean must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Mean must be a `Tensor` of the same shape as scale containing the
            exponential running mean.
    variance: A `Tensor` of 1 dimension for population variance. The shape and
          meaning of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Variance must be a `Tensor` of the same shape as scale containing
            the estimated population variance computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Variance must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Variance must be a `Tensor` of the same shape as scale containing
            the exponential running variance.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for x. Supports "NHWC" (default) or "NCHW" for
                 4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
    is_training: A bool value to specify if the operation is used for
                 training or inference.
    name: A name for this operation (optional).
    exponential_avg_factor: A float number (usually between 0 and 1) used
                            for controlling the decay of the running
                            population average of mean and variance.
                            If set to 1.0, the current batch average is
                            returned.

  Returns:
    y: A 4D or 5D Tensor for the normalized, scaled, offset x.
    running_mean: A 1D Tensor for the exponential running mean of x.
                  The output value is (1 - exponential_avg_factor) * mean +
                  exponential_avg_factor * batch_mean, where batch_mean
                  is the mean of the current batch in x.
    running_var: A 1D Tensor for the exponential running variance
                 The output value is (1 - exponential_avg_factor) * variance +
                 exponential_avg_factor * batch_variance, where batch_variance
                 is the variance of the current batch in x.
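
  For example, a minimal sketch (shapes chosen arbitrarily):

  ```python
  x = tf.random.normal([2, 4, 4, 3])  # NHWC
  scale = tf.ones([3])
  offset = tf.zeros([3])
  # In training mode the batch statistics are computed from `x` itself.
  y, batch_mean, batch_var = tf.compat.v1.nn.fused_batch_norm(
      x, scale, offset, is_training=True)
  ```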

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  r`   zBoth `mean` and `variance` must be a 1D tensor when `is_training` is False or `exponential_avg_factor` != 1.0. Received: `mean` z and `variance` r   r   r  r  )r   exponential_avg_factorr   is_trainingr   )r(   r   r%   r   r*   r   fused_batch_norm_v3)rU   r  r  r   r   r   r   r  r   r  r  running_meanrunning_varr   s                 r9   fused_batch_normr    s    V /36|)
 ..2X5E |% & & 
AG,!


G
4%  h7&	\#D##B'H*4*H*H
3
+'!\;1a 
L+	%%rI   z'nn.batch_norm_with_global_normalizationc           	          t        d|d|       } t        d|	d|      }t        d|
d|      }t        | ||||r|||      S d||      S )aG  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    t: A 4D input Tensor.
    m: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    v: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulting tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).
    input: Alias for t.
    mean: Alias for m.
    variance: Alias for v.

  Returns:
     A batch-normalized `t`.
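
  The same computation can be written with `tf.nn.batch_normalization`, which
  is the recommended replacement (a sketch, using the arguments described
  above):

  ```python
  y = tf.nn.batch_normalization(
      t, m, v, beta, gamma if scale_after_normalization else None,
      variance_epsilon)
  ```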

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  r   tr   mr   vN)r   r  )r!  r"  r#  r]   gammar  scale_after_normalizationr   r   r   r   s              r9   $batch_norm_with_global_normalizationr&  :  sj    ^ !%a8! sA6! XsA>!	Q1d5NE(8$
@ @"&(8$
@ @rI   c           
      (    t        | |||||||      S )a  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    input: A 4D input Tensor.
    mean: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    variance: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulting tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).

  Returns:
     A batch-normalized `t`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  )r!  r"  r#  r]   r$  r  r%  r   )r&  )r   r   r   r]   r$  r  r%  r   s           r9   'batch_norm_with_global_normalization_v2r(  q  s)    P 
.04083749?OHa37
9 9rI   c                     t        j                  |       d   }t        j                  |dg      }t        j                  || j
                        }t        j                  t        j                  | |      dg      S )z5Returns a vector summing up each row of the matrix x.rK   )	r   r   r   stackr8   r"   reshaper   rY   )rU   cols
ones_shaper8   s       r9   	_sum_rowsr/    s_     
	A	$$$dAY/*	
AGG	,$			8??1d3bT	::rI   c           
      
   t        | t        j                        rt        |       } t        | t              s| g} t	        j
                  |d| |||gz         5  |j                  t        j                  k7  r$t        j                  |t        j                        }t        j                  |dg      }|t        j                  |||d||      }d |D        \  }}}t        j                  |t        j                        }t        j                  ||gd      }t!        j"                  | ||
      }|j                  |j                  k7  r t        j                  ||j                        }t        j$                  |ddgt'        j(                  t        j*                  |      d   dg            }t        j$                  |t'        j(                  t        j*                  |      d   dg      ddg      }t        j,                  ||d	      }t!        j"                  |||
      }|j                  |j                  k7  r t        j                  ||j                        }t        j$                  |dgt        j*                  |            }t        j$                  |t        j*                  |      dg      }t        j*                  |      d
d }t        j                  d|g|gd      }t        j.                  t        j0                  |d
      t        j                  ||            }t        j                  |t        j                  dg|gd            }t        j                  t3        |      d|g      }t        j                  |d|g      }||z  }||z  }|	r!t        j4                  |||      }|\  }} }!t        j                  |dd
g      }"t        j                  t        j                  | t        j6                        dd
g      }#t        j                  |"|#gd
d      }$t        j                  t        j*                  |      dd
 t        j0                  |d      gd      }%|j                  |!j                  k7  r t        j                  |!|j                        }!|t9        j:                  |$|%|!dd      z  }|r0|t        j<                  |      z  }|t        j<                  |      z  }t        j                  ||gd
      }&t        j                  t        j>                  |      |z  t        j@                  |      gd
      }'|&|'fcddd       S # 1 sw Y   yxY w)a(
  Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
        class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  Whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: Random seed for candidate sampling. Defaults to None, which doesn't set
        the op-level random seed for candidate sampling.
  Returns:
    out_logits: `Tensor` object with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
    out_labels: A Tensor object with the same shape as `out_logits`.
  compute_sampled_logitsr*  NT)true_classesnum_truenum_sampledunique	range_maxseedc              3   F   K   | ]  }t        j                  |        y wr   )r   r   )r   ss     r9   r   z*_compute_sampled_logits.<locals>.<genexpr>  s       <<'(	"<<s   !r   )partition_strategy)transpose_brK   r#   )r3  sparse_indicesg        F)default_valuevalidate_indices)!
isinstancer   PartitionedVariabler   r   r$   r"   r   r   r   rk   r   r,  r   log_uniform_candidate_samplerconcatr   embedding_lookupslicer   r+  r   rY   r   expand_dimsr/  compute_accidental_hitsr   r   sparse_to_denser-   r/   r.   )(rV   rW   r?   inputsr4  num_classesr3  sampled_valuessubtract_log_qremove_accidental_hitsr:  r   r7  labels_flatsampledtrue_expected_countsampled_expected_countall_idsall_wtrue_w	sampled_wsampled_logitsall_btrue_b	sampled_bry   new_true_w_shaperow_wise_dotsdots_as_matrixtrue_logitsacc_hitsacc_indicesacc_idsacc_weightsacc_indices_2dacc_ids_2d_int32r<  sampled_logits_shape
out_logits
out_labelss(                                           r9   _compute_sampled_logitsrf    s   x 6677mG	GT	"iG
~~d4 88: v"||v||#}}VV\\2f##FRD1K -KK!n<<,:<<8G "8 mmGV\\2G W5q9G
 **-?AE{{fll"mmE6<<0e __UQF,22"+//+">q"A2!FHIF y{;A>BCb"XOI __VYDIN **,>@E{{fll"mmE6<<0e __UQC)EFFy{'CbTJI
 //&
!!A
&C ''"h(=qA%%fa(&"235M
 &&}'0'7'7"sQ'GIN##In$=H~NKvH~6F6KiN'??
'H.h*2'k7K !((r1g>n"**
--
.Q9 ''9I(JA(8:n '--??6"2A&  a02346 
		!2!2	2mmK1E1EF66


 " "n X\\"566k%;<<n !!;"?CJ
 !!K(83^,# 	
J
 z!mv" v" v"s   S)U		Uznn.nce_lossc
                 .    t        | ||||||||d|	      S )a  Computes and returns the noise-contrastive estimation training loss.

  See [Noise-contrastive estimation: A new estimation principle for
  unnormalized statistical
  models](https://arxiv.org/abs/1806.03664).
  Also see our [Candidate Sampling Algorithms
  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategies will be added
  later.

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
      per batch. This single sample of negative classes is evaluated for each
      element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  If set to `True`,
      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
      generate log-odds instead of log probabilities.  See our [Candidate
      Sampling Algorithms Reference]
        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
          False.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.
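
  A minimal end-to-end sketch with concrete (arbitrarily chosen) sizes:

  ```python
  num_classes, dim, batch_size = 10000, 128, 32
  weights = tf.Variable(tf.random.normal([num_classes, dim]))
  biases = tf.Variable(tf.zeros([num_classes]))
  labels = tf.random.uniform([batch_size, 1], maxval=num_classes,
                             dtype=tf.int64)
  inputs = tf.random.normal([batch_size, dim])
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=weights, biases=biases, labels=labels,
                     inputs=inputs, num_sampled=64, num_classes=num_classes))
  ```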
  div)r3  rJ  rL  r:  r   )nce_loss)
rV   rW   r?   rH  r4  rI  r3  rJ  rL  r   s
             r9   nce_loss_v2rj  l  s3    v 
#3
 rI   c                 h    t        | |||||||d||	|
      \  }}t        ||d      }t        |      S )an  Computes and returns the noise-contrastive estimation training loss.

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
        per batch. This single sample of negative classes is evaluated for each
        element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  If set to
        `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
        learning to generate log-odds instead of log probabilities. See
        our Candidate Sampling Algorithms Reference
        ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
        Default is False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.

  References:
    Noise-contrastive estimation - A new estimation principle for unnormalized
    statistical models:
      [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
      ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
  T)rV   rW   r?   rH  r4  rI  r3  rJ  rK  rL  r:  r   sampled_lossesrG   )rf  r<   r/  )rV   rW   r?   rH  r4  rI  r3  rJ  rL  r:  r   r>   rl  s                r9   ri  ri    sX    x +#3+.&& 5F)9;. 
>	""rI   znn.sampled_softmax_lossc                 0    t        | ||||||||d|
|	      S )a
  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our [Candidate Sampling Algorithms Reference]
  (https://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.  Note that this format differs from the `labels` argument
      of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  Default is True.
    seed: random seed for candidate sampling. Default to None, which doesn't set
      the op-level random seed for candidate sampling.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  rh  )r3  rJ  rL  r:  r   r7  )sampled_softmax_loss)rV   rW   r?   rH  r4  rI  r3  rJ  rL  r7  r   s              r9   sampled_softmax_loss_v2ro  E  s6    Z 
#3
 rI   c                     t        | |||||||d||	|
|      \  }}t        j                  |d      }t        j                  ||      }|S )a2  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our Candidate Sampling Algorithms Reference
  ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
  Also see Section 3 of (Jean et al., 2014) for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  References:
    On Using Very Large Target Vocabulary for Neural Machine Translation:
      [Jean et al., 2014]
      (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
      ([pdf](http://aclweb.org/anthology/P15-1001))
  T)rV   rW   r?   rH  r4  rI  r3  rJ  rK  rL  r:  r   r7  labels_stop_gradientr   )r?   r>   )rf  r   r   r   $softmax_cross_entropy_with_logits_v2)rV   rW   r?   rH  r4  rI  r3  rJ  rL  r:  r   r7  r>   rl  s                 r9   rn  rn    sl    h +#3+.&& ""60FG&>>F$. 
rI   )FN)NNNr   )NNNNN)r`   )	euclideanNN)Ng-q=NN)NNNN)NFN)NNgMbP?NHWCTNr`   )NNNNNNNNNNN)rK   NTFmodNN)rK   NFri  )rK   NFru  ri  )rK   NTNrn  )rK   NTru  rn  N)C__doc__r+   tensorflow.python.frameworkr   r   r   tensorflow.python.opsr   r   r   r	   r   r
   r   r   r   r   r   r   r   r   r   tensorflow.python.platformr   tensorflow.python.utilr   "tensorflow.python.util.deprecationr   r    tensorflow.python.util.tf_exportr   add_dispatch_supportr   r<   register_binary_elementwise_apirH   rP   rR   rT   register_unary_elementwise_apirl   rn   r{   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r&  r(  r/  rf  rj  ri  ro  rn   rI   r9   <module>r     s   3  3 . + + 1 8 1 ) 1 / , 0 , * : ( + 5 + > I 6  !	=  "=@ 567		"  8"N 1b9	))		Y/  * :Y/z )00 " ) 2r:	/3W  ;Wt 678	BIN.2.226,0/3	;Q O  9;Q|  	-  !-, 9j!	((	2$  ) "2$l 	0  0f  57HMO	<eD47 E O47n (.|| * !34	%A  5%AR $%&	
 !%#|  '|~  R(	
 %)"&!]3  )]3F $%&	 !%#g  'gT  R(	 	E  )ET )*+	DH#'F#  ,F#R %"-	B  .B8 !"	  #> |n	 	=  =@ <B	 	 M    MF $%&	FJ"H,  'H,V  R(	  ). #$	 "BM  %BMJ $%&	
 
	b&  'b&J 89:	+/+/+/.2/3:>CG.2/3.2261@  ;1@j 4<	 26-9  =-9d
;& &'+/+/38/4!%!%w"t =R 	 #',d  !dN }o	  $) %k#  k#\ $,	 &'+/37!%!7W  -Wt ()*	 #$(,04,14"d  +drI   