
"""Implementation of Neural Net (NN) functions."""

import math

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import device_context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import candidate_sampling_ops
from tensorflow.python.ops import cond as tf_cond
from tensorflow.python.ops import ctc_ops
from tensorflow.python.ops import custom_gradient
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gen_sparse_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_fused_batch_norm_grad  # pylint: disable=unused-import
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variables
from tensorflow.python.util import dispatch
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.deprecation import deprecated_argument_lookup
from tensorflow.python.util.tf_export import tf_export


@tf_export("nn.log_poisson_loss")
@dispatch.add_dispatch_support
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().assert_is_compatible_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "`log_input` and `targets` must have the same shape, received "
          f"({log_input.get_shape()} vs {targets.get_shape()}).")

    result = math_ops.exp(log_input) - log_input * targets
    if compute_full_loss:
      # Need to create constant tensors here so that their dtypes can be
      # matched to that of the targets.
      point_five = constant_op.constant(0.5, dtype=targets.dtype)
      two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)

      stirling_approx = (targets * math_ops.log(targets)) - targets + (
          point_five * math_ops.log(two_pi * targets))
      zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
      ones = array_ops.ones_like(targets, dtype=targets.dtype)
      cond = math_ops.logical_and(targets >= zeros, targets <= ones)
      result += array_ops.where(cond, zeros, stirling_approx)
    return result


@tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
    labels=None, logits=None, name=None):
  """See sigmoid_cross_entropy_with_logits_v2."""
  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", labels, logits)  # pylint: disable=protected-access

  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError(
          "`logits` and `labels` must have the same shape, received "
          f"({logits.get_shape()} vs {labels.get_shape()}).")

    # The logistic loss formula from above is
    #   x - x * z + log(1 + exp(-x))
    # For x < 0, a more numerically stable formula is
    #   -x * z + log(1 + exp(x))
    # Note that these two expressions can be combined into the following:
    #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    cond = (logits >= zeros)
    relu_logits = array_ops.where(cond, logits, zeros)
    neg_abs_logits = array_ops.where(cond, -logits, logits)
    return math_ops.add(
        relu_logits - logits * labels,
        math_ops.log1p(math_ops.exp(neg_abs_logits)),
        name=name)


@tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits_v2(  # pylint: disable=invalid-name
    labels=None, logits=None, name=None):
  """Computes sigmoid cross entropy given `logits`.

  Measures the probability error in tasks with two outcomes in which each
  outcome is independent and need not have a fully certain label. For instance,
  one could perform a regression where the probability of an event happening is
  known and used as a label. This loss may also be used for binary
  classification, where labels are either zero or one.

  For brevity, let `x = logits`, `z = labels`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `labels` must have the same type and shape.

  >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
  >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=labels, logits=logits).numpy()
  array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
         0.6931472], dtype=float32)

  Compared to the losses which handle multiple outcomes,
  `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
  classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
  efficient multi-class classification with hard labels,
  `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
  classification:

        sigmoid(x) = softmax([x, 0])[0]

  $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$

  While `sigmoid_cross_entropy_with_logits` works for soft binary labels
  (probabilities between 0 and 1), it can also be used for binary classification
  where the labels are hard. There is an equivalence between all three symbols
  in this case, with a probability 0 indicating the second class or 1 indicating
  the first class:

  >>> sigmoid_logits = tf.constant([1., -1., 0.])
  >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
  ...                           axis=-1)
  >>> soft_binary_labels = tf.constant([1., 1., 0.])
  >>> soft_multiclass_labels = tf.stack(
  ...     [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
  >>> hard_labels = tf.constant([0, 0, 1])
  >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
  ...     labels=hard_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
  >>> tf.nn.softmax_cross_entropy_with_logits(
  ...     labels=soft_multiclass_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=soft_binary_labels, logits=sigmoid_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
      inclusive.
    logits: A `Tensor` of type `float32` or `float64`. Any real number.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  return sigmoid_cross_entropy_with_logits(
      logits=logits, labels=labels, name=name)


sigmoid_cross_entropy_with_logits.__doc__ = (
    sigmoid_cross_entropy_with_logits_v2.__doc__)


@tf_export("nn.weighted_cross_entropy_with_logits", v1=[])
@dispatch.add_dispatch_support
def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight,
                                           name=None):
  """Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  >>> labels = tf.constant([1., 0.5, 0.])
  >>> logits = tf.constant([1.5, -0.1, -10.])
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
  array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
  array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`, with values
      between 0 and 1 inclusive.
    logits: A `Tensor` of type `float32` or `float64`, any real numbers.
    pos_weight: A coefficient to use on the positive examples, typically a
      scalar but otherwise broadcastable to the shape of `logits`. Its value
      should be non-negative.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError(
          "`logits` and `labels` must have the same shape, received "
          f"({logits.get_shape()} vs {labels.get_shape()}).")

    # The logistic loss formula from above is
    #   (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
    # To ensure stability and avoid overflow, the implementation uses
    #   (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
    log_weight = 1 + (pos_weight - 1) * labels
    return math_ops.add(
        (1 - labels) * logits,
        log_weight * (math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) +
                      nn_ops.relu(-logits)),
        name=name)
-C=7A<D8=8D55D88Ez)targets is deprecated, use labels insteadr   c                 :    t        d| d|      } t        | |||      S )a  Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  Args:
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.
    pos_weight: A coefficient to use on the positive examples.
    name: A name for the operation (optional).
    targets: Deprecated alias for labels.

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  labels = deprecated_argument_lookup("labels", labels, "targets", targets)
  return weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, name)


@tf_export("nn.relu_layer")
@dispatch.add_dispatch_support
def relu_layer(x, weights, biases, name=None):
  """Computes Relu(x * weight + biases).

  Args:
    x: a 2D tensor.  Dimensions typically: batch, in_units
    weights: a 2D tensor.  Dimensions typically: in_units, out_units
    biases: a 1D tensor.  Dimensions: out_units
    name: A name for the operation (optional).  If not specified
      "nn_relu_layer" is used.

  Returns:
    A 2-D Tensor computing relu(matmul(x, weights) + biases).
    Dimensions typically: batch, out_units.
  """
  with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name:
    x = ops.convert_to_tensor(x, name="x")
    weights = ops.convert_to_tensor(weights, name="weights")
    biases = ops.convert_to_tensor(biases, name="biases")
    xw_plus_b = nn_ops.bias_add(math_ops.matmul(x, weights), biases)
    return nn_ops.relu(xw_plus_b, name=name)
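

# Illustrative sketch (an addition for exposition, not part of the original
# module): relu_layer fuses matmul, bias_add and relu; the hypothetical
# helper below spells out the equivalent composition.
def _example_relu_layer_decomposition():
  x = constant_op.constant([[1.0, -2.0]])
  weights = constant_op.constant([[3.0], [4.0]])
  biases = constant_op.constant([0.5])
  fused = relu_layer(x, weights, biases)
  manual = nn_ops.relu(nn_ops.bias_add(math_ops.matmul(x, weights), biases))
  return fused, manual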


@tf_export("nn.silu", "nn.swish")
@dispatch.add_dispatch_support
def swish(features, beta=1.0):
  """Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.

  beta : Hyperparameter for Swish activation function. Default value 1.0.

  The SiLU activation function was introduced in "Gaussian Error Linear Units
  (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
  "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
  Reinforcement Learning"
  [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
  discovered (and called swish) in "Searching for Activation Functions"
  [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)

  Args:
    features: A `Tensor` representing preactivation values.
    beta: A 'Tensor' representing value of beta hyperparameter.

  Returns:
    The activation value.
  """
  features = ops.convert_to_tensor(features, name="features")
  beta = ops.convert_to_tensor(beta, name="beta")
  beta = math_ops.cast(beta, features.dtype)

  @custom_gradient.custom_gradient
  def swish_impl(features, beta):

    def grad(dy):
      """Gradient for the Swish activation function."""
      # Recompute sigmoid(beta * features) in the backward pass instead of
      # keeping the forward-pass value alive, trading compute for memory.
      with ops.control_dependencies([dy]):
        sigmoid_features = math_ops.sigmoid(beta * features)
      activation_grad = (
          sigmoid_features * (1.0 + (beta * features) *
                              (1.0 - sigmoid_features)))
      beta_grad = math_ops.reduce_sum(
          dy * math_ops.square(features) * sigmoid_features *
          (1.0 - sigmoid_features))
      return (dy * activation_grad, beta_grad)

    return features * math_ops.sigmoid(beta * features), grad

  return swish_impl(features, beta)
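

# Illustrative sketch (an addition for exposition, not part of the original
# module): the closed-form activation gradient used above is
# sigmoid(b*x) * (1 + b*x * (1 - sigmoid(b*x))). The hypothetical helper
# below evaluates the Swish value and that manual gradient for one point.
def _example_swish_value_and_manual_grad():
  x = constant_op.constant(0.5)
  b = constant_op.constant(1.0)
  s = math_ops.sigmoid(b * x)
  value = x * s
  manual_grad = s * (1.0 + (b * x) * (1.0 - s))
  return value, manual_grad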


@tf_export("linalg.normalize")
@dispatch.add_dispatch_support
def normalize(tensor, ord="euclidean", axis=None, name=None):
  """Normalizes `tensor` along dimension `axis` using specified norm.

  This uses `tf.linalg.norm` to compute the norm along `axis`.

  This function can compute several different vector norms (the 1-norm, the
  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).

  Args:
    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
    ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
      `2`, `np.inf` and any positive real number yielding the corresponding
      p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
      `tensor` is a matrix and equivalent to 2-norm for vectors.
      Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
        vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
        '`fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
        on how to compute norms for a batch of vectors or matrices stored in a
        tensor.
    axis: If `axis` is `None` (the default), the input is considered a vector
      and a single vector norm is computed over the entire set of values in the
      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
      `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
      input is considered a batch of vectors, and `axis` determines the axis in
      `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
      Python integers it is considered a batch of matrices and `axis` determines
      the axes in `tensor` over which to compute a matrix norm.
      Negative indices are supported. Example: If you are passing a tensor that
        can be either a matrix or a batch of matrices at runtime, pass
        `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
        computed.
    name: The name of the op.

  Returns:
    normalized: A normalized `Tensor` with the same shape as `tensor`.
    norm: The computed norms with the same shape and dtype `tensor` but the
      final axis is 1 instead. Same as running
      `tf.cast(tf.linalg.norm(tensor, ord, axis keepdims=True), tensor.dtype)`.

  Raises:
    ValueError: If `ord` or `axis` is invalid.
  """
  with ops.name_scope(name, "normalize", [tensor]) as name:
    tensor = ops.convert_to_tensor(tensor)
    norm = linalg_ops.norm(tensor, ord, axis, keepdims=True)
    norm = math_ops.cast(norm, tensor.dtype)
    normalized = tensor / norm
    return normalized, norm


@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize")
@dispatch.add_dispatch_support
@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
  """Normalizes along dimension `axis` using an L2 norm.

  For a 1-D tensor with `axis = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `axis`.

  1-D tensor example:
  >>> x = tf.constant([3.0, 4.0])
  >>> tf.math.l2_normalize(x).numpy()
  array([0.6, 0.8], dtype=float32)

  2-D tensor example:
  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 0).numpy()
  array([[0.6],
       [0.8]], dtype=float32)

  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 1).numpy()
  array([[1.],
       [1.]], dtype=float32)

  Args:
    x: A `Tensor`.
    axis: Dimension along which to normalize.  A scalar or a vector of
      integers.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).
    dim: Deprecated, do not use.

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
  with ops.name_scope(name, "l2_normalize", [x]) as name:
    x = ops.convert_to_tensor(x, name="x")
    if x.dtype.is_complex:
      square_real = math_ops.square(math_ops.real(x))
      square_imag = math_ops.square(math_ops.imag(x))
      square_sum = math_ops.real(
          math_ops.reduce_sum(square_real + square_imag, axis, keepdims=True))
      x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
      norm_real = math_ops.multiply(math_ops.real(x), x_inv_norm)
      norm_imag = math_ops.multiply(math_ops.imag(x), x_inv_norm)
      return math_ops.complex(norm_real, norm_imag, name=name)
    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
    x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
    return math_ops.multiply(x, x_inv_norm, name=name)


def _count_nonzero(input_tensor, dtype=dtypes.int64):
  """Same as math_ops.count_nonzero.
  The reduction is done in dtype, which can be faster for 32-bit dtypes.

  Args:
      input_tensor: numeric tensor
      dtype: reduction dtype

  Returns:
      number of nonzero values with type dtype
  count_nonzero)valuesr!   nonzero_countr   N)	r   r$   r   r7   r"   r   rc   rk   	not_equal)input_tensorr"   zeror   s       r9   _count_nonzeror   W  sv     ~~o|n= ??2\%7%78D''|T2	.0M   s   A#BBzmath.zero_fractionznn.zero_fractionc                 r    t        j                  |d g      5  t        j                   d       t        j                   t
        j                        }t        j                  |t
        j                  j                  k   fd fd      }t        j                  d      5  ||z
  }t        j                  |t
        j                  	      }t        j                  |t
        j                  	      }||z  }d
d
d
       t        j                  d      cd
d
d
       S # 1 sw Y   (xY w# 1 sw Y   y
xY w)a  Returns the fraction of zeros in `value`.

  If `value` is empty, the result is `nan`.

  This is useful in summaries to measure and report sparsity.  For example,

  ```python
      z = tf.nn.relu(...)
      summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  zero_fractionvaluer   )out_typec                      t        j                  t         t        j                        t        j
                        S Nr!   )r   rk   r   r   int32int64r   s   r9   <lambda>zzero_fraction.<locals>.<lambda>  s%    55,,  rI   c                  :    t         t        j                        S r   )r   r   r   r   s   r9   r   zzero_fraction.<locals>.<lambda>  s    V\\B rI   )true_fnfalse_fncounts_to_fractionr!   Nfraction)r   r$   r%   r   sizer   r   tf_condr	   r   maxr   rk   float32identity)r   r   r   num_nonzeronum_zeronum_zero_float32size_float32zero_fraction_float32s   `       r9   r   r   l  s    * ~~dOeW5 A!!%g6E>>%&,,7D,,     CDK 
,	- >#h!xv~~F]]4v~~>l.=	> 3Z@%A A> >A As%   BD-%AD!:D-!D*	&D--D6znn.depthwise_conv2dc           
         t        d|d|      }t        j                  d| g      5 t        j                  | d      } t        j                  d      |ddg}t	        j
                         Cd	k(  rdd|d
   |d   g}nd|d
   |d   dg}t        j                  | ||      cddd       S fd}t        j                  | t        j                        |||      cddd       S # 1 sw Y   yxY w)ah  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                           strides[2] * j + rate[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
  ...                                 ).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  	dilationsrate	depthwise	tensor_inr   	filter_inNrK   NCHWr   inputfilterstridespaddingdata_formatr   r   c                 :    t        j                  | |      S )Nr   r   r   r   r   r   r   depthwise_conv2d_native)input_converted_r   r   r   r   r   s      r9   opzdepthwise_conv2d.<locals>.op  s(    ++! rI   r   filter_shapedilation_rater   r   r   )r   r   r$   r%   r   enclosing_tpu_contextr   r   with_space_to_batchr   shape)	r   r   r   r   r   r   r   r   r   s	    ``  ``  r9   depthwise_conv2dr     s   r 
$KFD	I$
~~dK%9 $T!!%k:E""6<F|Vd ++-9		47DG,	Qa!,	++!$ $* %%__V,=$ $ $s   BC785C77D c           	      &    t        | ||||||      S )a  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] =
          sum_{di, dj} filter[di, dj, k, q] *
                       input[b, strides[1] * i + dilations[0] * di,
                                strides[2] * j + dilations[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `dilations` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. See
      [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
      for more information. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  )r   r   r   r   r   r   r   )r   r   s          r9   depthwise_conv2d_v2r     s&    r 
!'")")(#&1
3 3rI   znn.separable_conv2dc	           	      t   t        d|d|      }t        j                  |d| |g      5 }t        j                  | d      } t        j                  d      t        j                  |d      }|j	                         j                  d      }	|	j                  d	   j                  d
       |	j                  d
   j                  d
       |d
d
g}fd}
t        j                  | t        j                        |||
      }t        j                  ||g dd|      cddd       S # 1 sw Y   yxY w)a
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
      Contains `in_channels` convolutional filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape
      `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
      filter to mix channels after `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for
      each dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  r   r   separable_conv2dr   r   depthwise_filterpointwise_filter   r   rK   Nc                 :    t        j                  | |d      S )Nr   r   r   )r   r   r   r   r   r   s      r9   r   zseparable_conv2d.<locals>.op  s(    ++!! rI   r   )rK   rK   rK   rK   VALID)r   r   r   )r   r   r$   r%   r&   	with_rankdimsr'   r   r   r   r   conv2d)r   r   r   r   r   r   r   r   r   pointwise_filter_shaper   r   s    ` `   `    r9   r   r   }  s;   ~ 
$KFD	I$
~~d..0@AC )FJ!!%k:E,,13,,13 .779CCAF"<<Q?"<<Q?|Vd **__%56I ==,I) ) )s   C8D..D7c           
      (    t        | |||||||      S )a
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
      filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
      in_channels, out_channels]`.  Pointwise filter to mix channels after
      `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  )r   r   r   )r   )r   r   r   r   r   r   r   r   s           r9   separable_conv2d_v2r     s*    ~ 

 rI   znn.sufficient_statisticsc                    t        t        |            }t        d|d|      }|d}t        j                  |d| |g      5  t        j
                  | d      } | j                         j                  [t        fd|D              rGd	}|D ]  }|j                  |   j                  z  }  t        j                  || j                  
      }nt        j                  |       }|D 	cg c]  }	|	dk  r|	|z   n|	 }
}	t        j                  t!        j"                  t        j$                  |       | j                        |
      }t!        j&                  |d      }|Dt        j
                  |d      }t!        j(                  | |      }t!        j*                  | |      }n| }t!        j,                  |       }t!        j.                  |||d      }t!        j.                  |||d      }ddd       |fS c c}	w # 1 sw Y   xY w)a8  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  For example:
  >>> t = [[1, 2, 3], [4, 5, 6]]
  >>> sufficient_statistics(t, [1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
  >>> sufficient_statistics(t, [-1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance. As in
      Python, the axes can also be negative numbers. A negative axis is
      interpreted as counting from the end of the rank, i.e., axis +
      rank(values)-th dimension.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.
    keepdims: Alias for keep_dims.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  rp   	keep_dimsNFsufficient_statisticsrU   r   c              3   T   K   | ]  }j                   |   j                  d u ! y wN)r   r   ).0dx_shapes     r9   	<genexpr>z(sufficient_statistics.<locals>.<genexpr>g  s)      (9./QT)(9s   %(rK   r!   r   countshiftmean_ssrp   r   var_ss)listsetr   r   r$   r%   r&   rankallr   r   r   r*   r"   r   gatherr   rk   r   reduce_prodsubtractsquared_differencerd   rc   )rU   axesr   r   r   rp   countsr   r   rt   positive_axesx_dimsm_ssv_ssr   s                 @r9   r   r   5  s   T 
c$i$((K4)I
~~d3aZ@ Nac*AkkmG||C (937(9 %9f (!',,q/'''(##F!'':f ^^AdEIJTdQhtd{D8JmJ
--	*AGG
4mEf##F9f##E8eq%(d((E2dd__QdtTIINDtTIHMD1N2 
tU	"" KN Ns    B(G>)G9<C/G>9G>>Hc                 "    t        | ||||      S )aJ  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keepdims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  )rU   r   r   r   r   )r   rU   r   r   rp   r   s        r9   sufficient_statistics_v2r     s    4 
	EXD
B BrI   znn.normalize_momentsc                    t        j                  |d| |||g      5  t        j                  | d      }|1t        j                  ||d      }t        j
                  ||d      }nt        j                  ||d      }|}t        j                  t        j                  ||      t        j                  |      d      }ddd       ||fS # 1 sw Y   fS xY w)a  Calculate the mean and variance of based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  rn   divisorr   Nshifted_meanmeanvariance)r   r$   r   
reciprocalr   rB   r   rd   )	r   r   variance_ssr   r   r   r   r   r   s	            r9   normalize_momentsr     s    $ ~~dK&';)NO !!&y9G&&wnMl\\,F;d&&wfEld  +w/%H 	 	s   B$CCz
nn.momentsc           	         t        d|d|      }|d}t        j                  |d| |g      5  | j                  t        j
                  k(  r$t        j                  | t        j                        n| }t        j                  ||dd      }t        j                  t        j                  |t        j                  |            |dd	      }|s,t        j                  ||      }t        j                  ||      }| j                  t        j
                  k(  rQt        j                  |t        j
                        t        j                  |t        j
                        fcddd       S ||fcddd       S # 1 sw Y   yxY w)
a  Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.
    keepdims: Alias to keep_dims.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  rp   r   NFmomentsTr   r   r   )r   r   r$   r"   r   float16r   rk   r   reduce_meanr   r   stop_gradientsqueeze)	rU   r   r   r   r   rp   yr   r   s	            r9   r   r     s5   H )(K4)I
~~dI4y1  -.GGv~~,Ea(1A4$VDD
 ####Ay'>'>t'DE	H
 tT*d""8T2hww&.. mmD&..1mmHfnn57) . H/  s   D/E2%E22E;c                 "    t        | ||||      S )a  Calculates the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation.
    keepdims: produce moments with the same dimensionality as the input.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  )rU   r   r   r   r   )r   r   s        r9   
moments_v2r    s    D 
14u48	LLrI   znn.weighted_momentsc                 (   t        d|d|      }|d}t        j                  |d| ||g      5  t        j                  | d      } t        j                  |d      }| j                  t
        j                  k(  }|r$t        j                  | t
        j                        } |j                  | j                  k7  r t        j                  || j                        }t        j                  || z  |d	d
      }|t        j                  |       z   }t        j                  ||dd
      }	t        j                  ||	      }
t        j                  |t        j                  | |
      z  |dd
      }t        j                  ||	      }|s.t        j                  |
|      }
t        j                  ||      }|rHt        j                  |
t
        j                        }
t        j                  |t
        j                        }|
|fcddd       S # 1 sw Y   yxY w)a  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.
    keepdims: Alias of keep_dims.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  rp   r   NFweighted_momentsrU   r   frequency_weightsweighted_input_sumT)r   rp   sum_of_weightsweighted_distsq)rt   )r   r   r$   r%   r"   r   r   r   rk   r   rc   r   r.   
div_no_nanr   r  )rU   r   r  r   r   rp   
needs_castr  broadcasted_weightsr	  weighted_meanr
  weighted_variances                r9   r  r  &  s   & )(K4)I
~~d.4Et0LM 3,ac*A-- 35 FNN*J
--6>>
*a!'')"--(9177C ",,At*>O ,i.B.B1.EE((T(84IN ''(:NKM ))H77=II	O !++O^L''DAm#++
$( mmM6>>Bm"--(96>>J++g3, 3, 3,s   GHHc                 "    t        | ||||      S )a  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    keepdims: Produce moments with the same dimensionality as the input.
    name: Name used to scope the operation.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  )rU   r   r  r   r   )r  )rU   r   r  rp   r   s        r9   weighted_moments_v2r  s  s     " 
	)
 rI   znn.batch_normalizationc           	      H   t        j                  |d| ||||g      5  t        j                  ||z         }|||z  }| t        j                  || j
                        z  t        j                  ||||z  z
  n| |z  | j
                        z   cddd       S # 1 sw Y   yxY w)a	  Batch normalization.

  Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
  `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
  shapes:

    * In all generality, they can have the same number of dimensions as the
      input `x`, with identical sizes as `x` for the dimensions that are not
      normalized over (the 'depth' dimension(s)), and dimension 1 for the
      others which are being normalized over.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=True)` during training, or running averages
      thereof during inference.
    * In the common case where the 'depth' dimension is the last dimension in
      the input tensor `x`, they may be one dimensional tensors of the same
      size as the 'depth' dimension.
      This is the case for example for the common `[batch, depth]` layout of
      fully-connected layers, and `[batch, height, width, depth]` for
      convolutions.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=False)` during training, or running averages
      thereof during inference.

  See equation 11 in Algorithm 2 of source:
  [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor.
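
  For example, a minimal sketch (shapes chosen arbitrarily):

  ```python
  x = tf.random.normal([8, 10])                # [batch, depth]
  mean, variance = tf.nn.moments(x, axes=[0])  # statistics over the batch
  offset = tf.zeros([10])                      # beta
  scale = tf.ones([10])                        # gamma
  y = tf.nn.batch_normalization(x, mean, variance, offset, scale,
                                variance_epsilon=1e-3)
  ```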

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  	batchnormN)r   r$   r   r   rk   r"   )rU   r   r   offsetscalevariance_epsilonr   invs           r9   batch_normalizationr    s    z ~~dK!T8UF)KL M
..$44
5C	Ulc x}}S!''**X]]%1us{AGG.M MM M Ms   A1BB!znn.fused_batch_normc
                 v   |r|	dk7  r||t        d|d|      t        j                  | d      } t        j                  |d      }t        j                  |d      }|t        j                  g       }|t        j                  g       }t        j                  | ||||||	|||
      \  }
}}}}}|
||fS )	a  Batch normalization.


  See Source: [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of 4 or 5 dimensions.
    scale: A `Tensor` of 1 dimension for scaling.
    offset: A `Tensor` of 1 dimension for bias.
    mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
          of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Mean must be a `Tensor` of the same shape as scale containing the
            estimated population mean computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Mean must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Mean must be a `Tensor` of the same shape as scale containing the
            exponential running mean.
    variance: A `Tensor` of 1 dimension for population variance. The shape and
          meaning of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Variance must be a `Tensor` of the same shape as scale containing
            the estimated population variance computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Variance must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Variance must be a `Tensor` of the same shape as scale containing
            the exponential running variance.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for x. Supports "NHWC" (default) or "NCHW" for
                 4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
    is_training: A bool value to specify if the operation is used for
                 training or inference.
    name: A name for this operation (optional).
    exponential_avg_factor: A float number (usually between 0 and 1) used
                            for controlling the decay of the running
                            population average of mean and variance.
                            If set to 1.0, the current batch average is
                            returned.

  Returns:
    y: A 4D or 5D Tensor for the normalized, scaled, offset x.
    running_mean: A 1D Tensor for the exponential running mean of x.
                  The output value is (1 - exponential_avg_factor) * mean +
                  exponential_avg_factor * batch_mean, where batch_mean
                  is the mean of the current batch in x.
    running_var: A 1D Tensor for the exponential running variance
                 The output value is (1 - exponential_avg_factor) * variance +
                 exponential_avg_factor * batch_variance, where batch_variance
                 is the variance of the current batch in x.
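
  For example, a minimal sketch (shapes chosen arbitrarily):

  ```python
  x = tf.random.normal([2, 4, 4, 3])  # NHWC
  scale = tf.ones([3])
  offset = tf.zeros([3])
  # In training mode the batch statistics are computed from `x` itself.
  y, batch_mean, batch_var = tf.compat.v1.nn.fused_batch_norm(
      x, scale, offset, is_training=True)
  ```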

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  r`   zBoth `mean` and `variance` must be a 1D tensor when `is_training` is False or `exponential_avg_factor` != 1.0. Received: `mean` z and `variance` r   r   r  r  )r   exponential_avg_factorr   is_trainingr   )r(   r   r%   r   r*   r   fused_batch_norm_v3)rU   r  r  r   r   r   r   r  r   r  r  running_meanrunning_varr   s                 r9   fused_batch_normr    s    V /36|)
 ..2X5E |% & & 
AG,!


G
4%  h7&	\#D##B'H*4*H*H
3
+'!\;1a 
L+	%%rI   z'nn.batch_norm_with_global_normalizationc           	          t        d|d|       } t        d|	d|      }t        d|
d|      }t        | ||||r|||      S d||      S )aG  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    t: A 4D input Tensor.
    m: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    v: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulting tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).
    input: Alias for t.
    mean: Alias for m.
    variance: Alias for v.

  Returns:
     A batch-normalized `t`.
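
  The same computation can be written with `tf.nn.batch_normalization`, which
  is the recommended replacement (a sketch, using the arguments described
  above):

  ```python
  y = tf.nn.batch_normalization(
      t, m, v, beta, gamma if scale_after_normalization else None,
      variance_epsilon)
  ```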

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  r   tr   mr   vN)r   r  )r!  r"  r#  r]   gammar  scale_after_normalizationr   r   r   r   s              r9   $batch_norm_with_global_normalizationr&  :  sj    ^ !%a8! sA6! XsA>!	Q1d5NE(8$
@ @"&(8$
@ @rI   c           
      (    t        | |||||||      S )a  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    input: A 4D input Tensor.
    mean: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    variance: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulting tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).

  Returns:
     A batch-normalized `t`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  )r!  r"  r#  r]   r$  r  r%  r   )r&  )r   r   r   r]   r$  r  r%  r   s           r9   'batch_norm_with_global_normalization_v2r(  q  s)    P 
.04083749?OHa37
9 9rI   c                     t        j                  |       d   }t        j                  |dg      }t        j                  || j
                        }t        j                  t        j                  | |      dg      S )z5Returns a vector summing up each row of the matrix x.rK   )	r   r   r   stackr8   r"   reshaper   rY   )rU   cols
ones_shaper8   s       r9   	_sum_rowsr/    s_     
	A	$$$dAY/*	
AGG	,$			8??1d3bT	::rI   c           
      
   t        | t        j                        rt        |       } t        | t              s| g} t	        j
                  |d| |||gz         5  |j                  t        j                  k7  r$t        j                  |t        j                        }t        j                  |dg      }|t        j                  |||d||      }d |D        \  }}}t        j                  |t        j                        }t        j                  ||gd      }t!        j"                  | ||
      }|j                  |j                  k7  r t        j                  ||j                        }t        j$                  |ddgt'        j(                  t        j*                  |      d   dg            }t        j$                  |t'        j(                  t        j*                  |      d   dg      ddg      }t        j,                  ||d	      }t!        j"                  |||
      }|j                  |j                  k7  r t        j                  ||j                        }t        j$                  |dgt        j*                  |            }t        j$                  |t        j*                  |      dg      }t        j*                  |      d
d }t        j                  d|g|gd      }t        j.                  t        j0                  |d
      t        j                  ||            }t        j                  |t        j                  dg|gd            }t        j                  t3        |      d|g      }t        j                  |d|g      }||z  }||z  }|	r!t        j4                  |||      }|\  }} }!t        j                  |dd
g      }"t        j                  t        j                  | t        j6                        dd
g      }#t        j                  |"|#gd
d      }$t        j                  t        j*                  |      dd
 t        j0                  |d      gd      }%|j                  |!j                  k7  r t        j                  |!|j                        }!|t9        j:                  |$|%|!dd      z  }|r0|t        j<                  |      z  }|t        j<                  |      z  }t        j                  ||gd
      }&t        j                  t        j>                  |      |z  t        j@                  |      gd
      }'|&|'fcddd       S # 1 sw Y   yxY w)a(
  Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
        class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  Whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: Random seed for candidate sampling. Defaults to None, which doesn't set
        the op-level random seed for candidate sampling.
  Returns:
    out_logits: `Tensor` object with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
    out_labels: A Tensor object with the same shape as `out_logits`.
  compute_sampled_logitsr*  NT)true_classesnum_truenum_sampledunique	range_maxseedc              3   F   K   | ]  }t        j                  |        y wr   )r   r   )r   ss     r9   r   z*_compute_sampled_logits.<locals>.<genexpr>  s       <<'(	"<<s   !r   )partition_strategy)transpose_brK   r#   )r3  sparse_indicesg        F)default_valuevalidate_indices)!
isinstancer   PartitionedVariabler   r   r$   r"   r   r   r   rk   r   r,  r   log_uniform_candidate_samplerconcatr   embedding_lookupslicer   r+  r   rY   r   expand_dimsr/  compute_accidental_hitsr   r   sparse_to_denser-   r/   r.   )(rV   rW   r?   inputsr4  num_classesr3  sampled_valuessubtract_log_qremove_accidental_hitsr:  r   r7  labels_flatsampledtrue_expected_countsampled_expected_countall_idsall_wtrue_w	sampled_wsampled_logitsall_btrue_b	sampled_bry   new_true_w_shaperow_wise_dotsdots_as_matrixtrue_logitsacc_hitsacc_indicesacc_idsacc_weightsacc_indices_2dacc_ids_2d_int32r<  sampled_logits_shape
out_logits
out_labelss(                                           r9   _compute_sampled_logitsrf    s   x 6677mG	GT	"iG
~~d4 88: v"||v||#}}VV\\2f##FRD1K -KK!n<<,:<<8G "8 mmGV\\2G W5q9G
 **-?AE{{fll"mmE6<<0e __UQF,22"+//+">q"A2!FHIF y{;A>BCb"XOI __VYDIN **,>@E{{fll"mmE6<<0e __UQC)EFFy{'CbTJI
 //&
!!A
&C ''"h(=qA%%fa(&"235M
 &&}'0'7'7"sQ'GIN##In$=H~NKvH~6F6KiN'??
'H.h*2'k7K !((r1g>n"**
--
.Q9 ''9I(JA(8:n '--??6"2A&  a02346 
		!2!2	2mmK1E1EF66


 " "n X\\"566k%;<<n !!;"?CJ
 !!K(83^,# 	
J
 z!mv" v" v"s   S)U		Uznn.nce_lossc
                 .    t        | ||||||||d|	      S )a  Computes and returns the noise-contrastive estimation training loss.

  See [Noise-contrastive estimation: A new estimation principle for
  unnormalized statistical
  models](https://arxiv.org/abs/1806.03664).
  Also see our [Candidate Sampling Algorithms
  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategies will be added
  later.

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
      per batch. This single sample of negative classes is evaluated for each
      element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  If set to `True`,
      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
      generate log-odds instead of log probabilities.  See our [Candidate
      Sampling Algorithms Reference]
        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
          False.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.
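
  A minimal end-to-end sketch with concrete (arbitrarily chosen) sizes:

  ```python
  num_classes, dim, batch_size = 10000, 128, 32
  weights = tf.Variable(tf.random.normal([num_classes, dim]))
  biases = tf.Variable(tf.zeros([num_classes]))
  labels = tf.random.uniform([batch_size, 1], maxval=num_classes,
                             dtype=tf.int64)
  inputs = tf.random.normal([batch_size, dim])
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=weights, biases=biases, labels=labels,
                     inputs=inputs, num_sampled=64, num_classes=num_classes))
  ```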
  div)r3  rJ  rL  r:  r   )nce_loss)
rV   rW   r?   rH  r4  rI  r3  rJ  rL  r   s
             r9   nce_loss_v2rj  l  s3    v 
#3
 rI   c                 h    t        | |||||||d||	|
      \  }}t        ||d      }t        |      S )an  Computes and returns the noise-contrastive estimation training loss.

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
        per batch. This single sample of negative classes is evaluated for each
        element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  If set to
        `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
        learning to generate log-odds instead of log probabilities. See
        our Candidate Sampling Algorithms Reference
        ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
        Default is False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.

  References:
    Noise-contrastive estimation - A new estimation principle for unnormalized
    statistical models:
      [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
      ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
  T)rV   rW   r?   rH  r4  rI  r3  rJ  rK  rL  r:  r   sampled_lossesrG   )rf  r<   r/  )rV   rW   r?   rH  r4  rI  r3  rJ  rL  r:  r   r>   rl  s                r9   ri  ri    sX    x +#3+.&& 5F)9;. 
>	""rI   znn.sampled_softmax_lossc                 0    t        | ||||||||d|
|	      S )a
  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our [Candidate Sampling Algorithms Reference]
  (https://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.  Note that this format differs from the `labels` argument
      of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  Default is True.
    seed: random seed for candidate sampling. Default to None, which doesn't set
      the op-level random seed for candidate sampling.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  rh  )r3  rJ  rL  r:  r   r7  )sampled_softmax_loss)rV   rW   r?   rH  r4  rI  r3  rJ  rL  r7  r   s              r9   sampled_softmax_loss_v2ro  E  s6    Z 
#3
 rI   c                     t        | |||||||d||	|
|      \  }}t        j                  |d      }t        j                  ||      }|S )a2  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our Candidate Sampling Algorithms Reference
  ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
  Also see Section 3 of (Jean et al., 2014) for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  References:
    On Using Very Large Target Vocabulary for Neural Machine Translation:
      [Jean et al., 2014]
      (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
      ([pdf](http://aclweb.org/anthology/P15-1001))
  T)rV   rW   r?   rH  r4  rI  r3  rJ  rK  rL  r:  r   r7  labels_stop_gradientr   )r?   r>   )rf  r   r   r   $softmax_cross_entropy_with_logits_v2)rV   rW   r?   rH  r4  rI  r3  rJ  rL  r:  r   r7  r>   rl  s                 r9   rn  rn    sl    h +#3+.&& ""60FG&>>F$. 
rI   )FN)NNNr   )NNNNN)r`   )	euclideanNN)Ng-q=NN)NNNN)NFN)NNgMbP?NHWCTNr`   )NNNNNNNNNNN)rK   NTFmodNN)rK   NFri  )rK   NFru  ri  )rK   NTNrn  )rK   NTru  rn  N)C__doc__r+   tensorflow.python.frameworkr   r   r   tensorflow.python.opsr   r   r   r	   r   r
   r   r   r   r   r   r   r   r   r   tensorflow.python.platformr   tensorflow.python.utilr   "tensorflow.python.util.deprecationr   r    tensorflow.python.util.tf_exportr   add_dispatch_supportr   r<   register_binary_elementwise_apirH   rP   rR   rT   register_unary_elementwise_apirl   rn   r{   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r&  r(  r/  rf  rj  ri  ro  rn   rI   r9   <module>r     s   3  3 . + + 1 8 1 ) 1 / , 0 , * : ( + 5 + > I 6  !	=  "=@ 567		"  8"N 1b9	))		Y/  * :Y/z )00 " ) 2r:	/3W  ;Wt 678	BIN.2.226,0/3	;Q O  9;Q|  	-  !-, 9j!	((	2$  ) "2$l 	0  0f  57HMO	<eD47 E O47n (.|| * !34	%A  5%AR $%&	
 !%#|  '|~  R(	
 %)"&!]3  )]3F $%&	 !%#g  'gT  R(	 	E  )ET )*+	DH#'F#  ,F#R %"-	B  .B8 !"	  #> |n	 	=  =@ <B	 	 M    MF $%&	FJ"H,  'H,V  R(	  ). #$	 "BM  %BMJ $%&	
 
	b&  'b&J 89:	+/+/+/.2/3:>CG.2/3.2261@  ;1@j 4<	 26-9  =-9d
;& &'+/+/38/4!%!%w"t =R 	 #',d  !dN }o	  $) %k#  k#\ $,	 &'+/37!%!7W  -Wt ()*	 #$(,04,14"d  +drI   