
"""Ops for GPU collective operations implemented using NVIDIA nccl."""

import threading

from tensorflow.python.eager import context
from tensorflow.python.eager import def_function
from tensorflow.python.framework import device
from tensorflow.python.framework import ops
from tensorflow.python.ops import gen_nccl_ops

_module_lock = threading.Lock()

_shared_name_counter = 0


def all_sum(tensors):
  """Returns a list of tensors with the all-reduce sum across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to sum; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the sum of the input tensors, where tensor i has
    the same device as `tensors[i]`.
  sum_apply_all_reducetensorss    N/home/dcms/DCMS/lib/python3.12/site-packages/tensorflow/python/ops/nccl_ops.pyall_sumr           
5'	**    NcclAllReducec                 T   | j                  d      dk7  rt        d      t        || j                         | j                  d      }| j                  d      dz   }t	        j                  | j                        5  t        j                  |d||	      cd
d
d
       S # 1 sw Y   y
xY w)a)  The gradients for `all_sum`.

  Args:
    op: The `all_sum` `Operation` that we are differentiating.
    grad: Gradient with respect to the output of the `all_sum` op.

  Returns:
    The gradient with respect to the output of `all_sum`.

  Raises:
    LookupError: If `reduction` is not `sum`.
  """
  if op.get_attr('reduction') != b'sum':
    raise LookupError('No gradient defined for NcclAllReduce except for '
                      'reduction="sum".')

  _check_device(grad, expected=op.device)
  num_devices = op.get_attr('num_devices')
  shared_name = op.get_attr('shared_name') + b'_grad'

  with ops.device(op.device):
    return gen_nccl_ops.nccl_all_reduce(
        input=grad,
        reduction='sum',
        num_devices=num_devices,
        shared_name=shared_name)


def all_prod(tensors):
  """Returns a list of tensors with the all-reduce product across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to multiply; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the product of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  """
  return _apply_all_reduce('prod', tensors)


def all_min(tensors):
  """Returns a list of tensors with the all-reduce min across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to reduce; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the minimum of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  """
  return _apply_all_reduce('min', tensors)


def all_max(tensors):
  """Returns a list of tensors with the all-reduce max across `tensors`.

  The computation is done with an all-reduce operation, so if only some of the
  returned tensors are evaluated then the computation will hang.

  Args:
    tensors: The input tensors across which to reduce; must be assigned
      to GPU devices.

  Returns:
    List of tensors, each with the maximum of the input tensors, where tensor i
    has the same device as `tensors[i]`.
  """
  return _apply_all_reduce('max', tensors)


def reduce_sum(tensors):
  """Returns a tensor with the reduce sum across `tensors`.

  The computation is done with a reduce operation, so only one tensor is
  returned.

  Args:
    tensors: The input tensors across which to sum; must be assigned
      to GPU devices.

  Returns:
    A tensor containing the sum of the input tensors.

  Raises:
    LookupError: If context is not currently using a GPU device.
  """
  return _apply_reduce('sum', tensors)
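
# Illustrative usage sketch, not part of the original module: unlike `all_sum`,
# `reduce_sum` returns a single tensor rather than one per device. Devices and
# values are assumptions for the example (see the `all_sum` sketch above for
# the imports).
#
#   with tf.device('/gpu:0'):
#     a = tf.constant([1.0, 2.0])
#   with tf.device('/gpu:1'):
#     b = tf.constant([3.0, 4.0])
#   total = nccl_ops.reduce_sum([a, b])  # one tensor equal to [4.0, 6.0]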


@ops.RegisterGradient('NcclReduce')
def _reduce_sum_grad(op, grad):
  """The gradients for input `Operation` of `reduce_sum`.

  Args:
    op: The `sum send` `Operation` that we are differentiating.
    grad: Gradient with respect to the output of the `reduce_sum` op.

  Returns:
    The gradient with respect to the input of `reduce_sum` op.

  Raises:
    LookupError: If the reduction attribute of op is not `sum`.
  """
  if op.get_attr('reduction') != b'sum':
    raise LookupError('No gradient defined for NcclReduce except for '
                      'reduction="sum".')
  _check_device(grad, expected=op.device)

  with ops.device(op.device):
    result = gen_nccl_ops.nccl_broadcast(input=grad, shape=grad.shape)

  return [result] * len(op.inputs)


def broadcast(tensor):
  """Returns a tensor that can be efficiently transferred to other devices.

  Args:
    tensor: The tensor to send; must be assigned to a GPU device.

  Returns:
    A tensor with the value of the input `tensor`, which can be used as input
    to ops on other GPU devices.
  """
  _check_device(tensor)

  with ops.device(tensor.device):
    return gen_nccl_ops.nccl_broadcast(input=tensor, shape=tensor.shape)
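
# Illustrative usage sketch, not part of the original module: a broadcast
# sender on one GPU and a consumer on another, e.g. inside a `tf.function` or
# a v1 graph. Devices and values are assumptions for the example.
#
#   with tf.device('/gpu:0'):
#     src = nccl_ops.broadcast(tf.constant([1.0, 2.0]))
#   with tf.device('/gpu:1'):
#     received = src + 0.0  # consuming `src` here transfers it via NCCL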


@ops.RegisterGradient('NcclBroadcast')
def _broadcast_grad(op, accumulated_grad):
  """The gradients for input `Operation` of `broadcast`.

  Args:
    op: The `broadcast send` `Operation` that we are differentiating.
    accumulated_grad: Accumulated gradients with respect to the output of the
      `broadcast` op.

  Returns:
    Gradients with respect to the input of `broadcast`.
  """
  # Grab inputs of accumulated_grad and replace accumulation with reduce_sum.
  grads = [t for t in accumulated_grad.op.inputs]
  for t in grads:
    _check_device(t)

  with ops.device(op.device):
    return gen_nccl_ops.nccl_reduce(input=grads, reduction='sum')


def _apply_all_reduce(reduction, tensors):
  """Helper function for all_* functions."""
  if not tensors:
    raise ValueError('Must pass >0 tensors to all reduce operations')

  shared_name = _get_shared_name()

  def _all_reduce():
    """Call nccl allreduce."""
    res = []
    for t in tensors:
      _check_device(t)
      with ops.device(t.device):
        res.append(
            gen_nccl_ops.nccl_all_reduce(
                input=t,
                reduction=reduction,
                num_devices=len(tensors),
                shared_name=shared_name))
    return res

  if context.executing_eagerly():
    # Nccl ops will block unless they are executed concurrently such as in a
    # graph or a defun.
    return def_function.function(_all_reduce)()
  else:
    return _all_reduce()


def _apply_reduce(reduction, tensors):
  """Helper function for reduce_* functions."""
  if not tensors:
    raise ValueError('Must pass >0 tensors to reduce operations')

  for t in tensors:
    _check_device(t)
  result = gen_nccl_ops.nccl_reduce(input=tensors, reduction=reduction)
  try:
    next(t for t in tensors if t.device == result.device)
  except StopIteration:
    raise ValueError('One input tensor must be assigned to current device')
  return result


def _get_shared_name():
  global _shared_name_counter

  with _module_lock:
    val = _shared_name_counter
    _shared_name_counter += 1
  return 'c%s' % val


def _check_device(tensor, expected=None):
  if not device.canonical_name(tensor.device):
    raise ValueError(f'Device assignment for tensor={tensor} required for nccl '
                     'collective ops')
  if expected and expected != tensor.device:
    raise ValueError(f'Expected device {expected}, got {tensor.device} for '
                     f'tensor={tensor}.')