
    Vh              '       "   U d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlmc mZ d dlmc mZ d dlmZ d dl m!Z! d d	l"m#Z#m$Z$m%Z%m&Z& d d
l'm(Z( ddgZ) G d de      Z* G d de      Z+ejX                  jZ                  Z- ej\                  e/      Z0e G d d             Z1 e1       Z2de3de3de3de4de*f
dZ5dejl                  dejl                  fdZ7dejl                  dejl                  de3de3de3d e4dejl                  fd!Z8 G d" d#      Z9	 	 	 dudd$d%e!d&ejl                  d'ejl                  d(ejl                  d)e:de4d*e4d+ee:   de;ejl                  d,f   fd-Z<	 	 	 	 dvdd$d%e!d&ejl                  d'ejl                  d(ejl                  d.eejl                     d/e4d)e:de4d+ee:   de;ejl                  d,f   fd0Z=	 	 	 	 	 dwdd$d%e!d&ejl                  d'ejl                  d(ejl                  d.eejl                     d/e4d)e:de4d*e4d+ee:   de;ejl                  d,f   fd1Z> G d2 d3e      Z? G d4 d5e      Z@ G d6 d7e@      ZA G d8 d9e@      ZB	 dxd:ej                  d;e3d<ee+   de@fd=ZDd>ejl                  d:ej                  d?e4dejl                  fd@ZE	 dyd%e!d;e3dAe?d&ejl                  d'ejl                  d(ejl                  de4dBeFde;ejl                  d,f   fdCZGdDej                  j                  dEe;eFd,f   dBeJeKeFf   deFfdFZLdDej                  j                  dEe;eFd,f   dBeJeKeFf   deFfdGZMd%e!d;e3dAe?dHejl                  dIeKd&ejl                  d'ejl                  d(ejl                  dJejl                  dKejl                  de4dBede;ejl                  d,f   fdLZNdd$d%e!dHejl                  d&ejl                  d'ejl                  d(ejl                  dJejl                  dKejl                  dMejl                  dNejl                  dOe3dPe3d)e:de4dQejl                  dRejl                  d+ee:   de;ejl                  d,f   f"dSZO	 dydd$d%e!dHejl                  d&ejl                  d'ejl                  d(ejl                  dTejl                  dJejl                  dKejl                  dQejl                  dRejl                  d)e:dUe;e4d,f   de4d+ee:   de;ejl                  d,f   fdVZPdd$d%e!dHejl                  d&ejl                  d'ejl                  d(ejl                  dJejl                  dKejl                  dQejl                  dRejl                  d.ejl                  dMejl                  dNejl                  dOe3dPe3d)e:de4d+ee:   de;ejl                  d,f   f$dWZQe-j                  j                  eLe-j                  j                  eMe-j                  j                  eLe-j                  j                  eMe-j                  j                  eLe-j                  j                  eMiZYi aZeJee;eKef   f   e[dX<   	 	 dzdYedZej                  d[e!d\ee   d]ee   ddfd^Z]dYedZej                  ddfd_Z^e j                  de	d`   fda       Z` G db dce(      Zae j                  d;e3d%e!de	d`   fdd       Zb G de dfe      Zc G dg dhec      Zd G di djec      Zed%e!dkefejl                     dlefe3   defejl                     fdmZge j                   ej                         ddddnd%e!dkeefejl                        dleefe3      doeeiejl                        de	d`   f
dp              Zj ej                         d%e!dkefejl                     dqefe3   defejl                     fdr       ZkdseKddfdtZly){    N)ABCabstractmethod)	Generator)	dataclass)autoEnum)AnyCallableOptionalProtocolUnion)nn)
DeviceMesh)distribute_moduleDTensor	ReplicateShard)ParallelStylecontext_parallelset_rotate_methodc                       e Zd ZdZdZdZy)_CausalBehaviorNFT)__name__
__module____qualname__SKIPNOT_IS_CAUSAL	IS_CAUSAL     `/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/tensor/experimental/_attention.pyr   r      s    DMIr    r   c                   (    e Zd Z e       Z e       Zy)_RotateMethodN)r   r   r   r   
ALL_TO_ALL
ALL_GATHERr   r    r!   r#   r#   !   s    JJr    r#   c                   B    e Zd ZU dZeed<   dZej                  Z	eed<   y)_ContextParallelOptionsTconvert_to_f32rotate_methodN)
r   r   r   r(   bool__annotations__enable_load_balancer#   r%   r)   r   r    r!   r'   r'   *   s$    
  ND#0#;#;M=;r    r'   rank
world_sizei	is_causalreturnc                     |st         j                  S |dk(  rt         j                  S | |z
  |z  }|| k  st        j                  rt         j                  S t         j
                  S )z
    Calculate is_causal behavior for each KV block. The attention can either be
    calculated in full, not at all or with the causal mask applied.
    r   )r   r   r   _cp_optionsr,   r   )r-   r.   r/   r0   source_ranks        r!   _is_causal_behaviorr5   7   s\     ,,,Av(((!8z)KT[<<,,,###r    tensorc                 Z    t        | t        j                        r| j                         S | S )zu
    When tracing the code, the result tensor is not an AsyncCollectiveTensor,
    so we cannot call ``wait()``.
    )
isinstanceft_cAsyncCollectiveTensorwait)r6   s    r!   _maybe_waitr<   K   s%    
 &$445{{}Mr    originalnewdimn_chunksidxaddc                    t        | j                  ||            }||   j                  |j                  k(  sJ | j                  |j                  |f       |r||xx   |z  cc<   n|||<   t        j                  ||      S )ax  
    This API partially update a chunk of ``original`` tensor. The ``original``
    tensor will be first chunked along ``dim`` dimension then the ``idx`` chunk
    will be updated with ``new``. If ``add`` is True, the chunk will be added
    with ``new``, otherwise the chunk with be replaced by ``add``.

    The result is a tensor that is the same size as ``original``.
    r?   )listchunkshapetorchcat)r=   r>   r?   r@   rA   rB   chunkss          r!   _partial_updaterK   U   sv      (..s.34F#;		)KHNNCIIs+KK)
sss99V%%r    c                       e Zd ZdZdedefdZdej                  dej                  dedd	fd
Z	dej                  dej                  dedd	fdZ
deej                  ej                  f   fdZy	)_SDPAMergerz/A class to help to merge the local SDPA result.r(   seq_dimc                     || _         d | _        d | _        || _        t        j
                  | _        t        j
                  | _        y N)_seq_dim_out_lse_convert_to_f32rH   float32
_out_dtype
_lse_dtype)selfr(   rN   s      r!   __init__z_SDPAMerger.__init__q   s5    ,0	,0	-----r    	block_out	block_lsepartialr1   Nc                    |j                  d      }| j                  || _        || _        y d}| j                  J | j                  J |r*| j                  j                  || j                        d   n| j                  }|r*| j                  j                  || j                        d   n| j                  }|t        j                  ||z
        ||z
  z  z
  }|t        j                  ||z
        z
  }|rUt        | j                  || j                  |dd      | _        t        | j                  || j                  |dd      | _        y || _        || _        y )NrD         Fr?   r@   rA   rB   )		unsqueezerS   rR   rF   rQ   Fsigmoid
logsigmoidrK   )rX   rZ   r[   r\   ROUND_ROBIN_CYCLElseouts          r!   
_merge_onez_SDPAMerger._merge_oney   sQ    ''B'/	99!DI!DI !99(((99(((  		 1t}}EaHYY   		 1t}}EaHYY  		)c/2cIoFFCS9_55C+II.	 ,II.	  		r    rh   rg   c                    |j                   | _        |j                   | _        | j                  r>|j	                  t
        j                        }|j	                  t
        j                        }| j                  |||       y rP   )dtyperV   rW   rT   torH   rU   ri   )rX   rh   rg   r\   s       r!   stepz_SDPAMerger.step   sU    ))))&&'C&&'CS'*r    c                     | j                   J | j                  J | j                   | j                  j                  d      }}|j                  | j                        |j                  | j
                        fS )Nr^   )rR   rS   squeezerl   rV   rW   )rX   rh   rg   s      r!   resultsz_SDPAMerger.results   sc    yy$$$yy$$$99dii//3Svvdoo&t(???r    )r   r   r   __doc__r*   intrY   rH   Tensorri   rm   tuplerp   r   r    r!   rM   rM   n   s    9(t (c (. . 27,,. IM. 	. `+ +5<< +$ +4 +@u||U\\9: @r    rM   )scalemeshquerykeyvalue	dropout_preturn_debug_maskru   .c                d    |rt        d      d}t        | |t        j                  ||||||	      S )Nz&return_debug_mask is not supported yetr_   )rw   rx   ry   r0   rz   ru   )NotImplementedError_templated_ring_attentionaten#_scaled_dot_product_flash_attention)	rv   rw   rx   ry   rz   r0   r{   ru   rN   s	            r!   (_scaled_dot_product_ring_flash_attentionr      sF     !"JKKG$00
 
r    	attn_biascompute_log_sumexpc                p    |t        d      |sd}d}	t        | |	t        j                  ||||||||      S )Nattn_bias is not supported yetTr_   )rw   rx   ry   r0   r   rz   ru   r   )r}   r~   r   '_scaled_dot_product_efficient_attention)
rv   rw   rx   ry   r   r   rz   r0   ru   rN   s
             r!   ,_scaled_dot_product_ring_efficient_attentionr      sX     !"BCC!G$44- r    c	                r    |t        d      |sd}d}
t        | |
t        j                  |||||||||	      S )Nr   Tr_   )	rw   rx   ry   r   r   rz   r0   r{   ru   )r}   r~   r   #_scaled_dot_product_cudnn_attention)rv   rw   rx   ry   r   r   rz   r0   r{   ru   rN   s              r!   (_scaled_dot_product_ring_cudnn_attentionr      s[     !"BCC!G$00-+ r    c                       e Zd Zdej                  dej                  dej                  dedeej                  df   f
dZy)	_AttentionOprw   rx   ry   kwargsr1   .c                      y rP   r   )rX   rw   rx   ry   r   s        r!   __call__z_AttentionOp.__call__  s     $'r    N)r   r   r   rH   rs   objectrt   r   r   r    r!   r   r     sQ    '||' \\' ||	'
 ' 
u||S 	!'r    r   c                       e Zd Zedej
                  deddfd       Zedej                  ddfd       Z
edej                  fd       Zy)	_RingRotaterpgrN   r1   Nc                      y rP   r   rX   r   rN   s      r!   rY   z_RingRotater.__init__)  s    EHr    curr_bufferc                      y rP   r   rX   r   s     r!   exchange_buffersz_RingRotater.exchange_buffers,  s    CFr    c                      y rP   r   rX   s    r!   next_bufferz_RingRotater.next_buffer/  s    +.r    )r   r   r   r   distProcessGrouprr   rY   rH   rs   r   r   r   r    r!   r   r   (  sX    H4,,HsHtH HFELLFTF F.U\\. .r    r   c                   |    e Zd ZdZdej
                  deddfdZdej                  ddfdZ
dej                  fd	Zy)
_AllToAllRotaterz.Use all_to_all to send the kv to the next rankr   rN   r1   Nc                 .    || _         || _        d | _        y rP   )_pgrQ   _bufferr   s      r!   rY   z_AllToAllRotater.__init__6  s    /3r    r   c                     |j                         }t        j                  | j                        }t	        t        d|            dgz   }t        j                  ||| j                        | _        y Nr`   r   )	
contiguousr   get_world_sizer   rE   ranger9   permute_tensorr   )rX   r   sizedstss       r!   r   z!_AllToAllRotater.exchange_buffers;  sU    !,,.""488,E!TN#qc)**;dhhGr    c                 H    | j                   J t        | j                         S rP   )r   r<   r   s    r!   r   z_AllToAllRotater.next_bufferA  s!    ||'''4<<((r    r   r   r   rq   r   r   rr   rY   rH   rs   r   r   r   r    r!   r   r   3  sO    844,, 4s 4t 4
HELL HT H)U\\ )r    r   c                   |    e Zd ZdZdej
                  deddfdZdej                  ddfdZ
dej                  fd	Zy)
_AllGatherRotaterzh
    Allgather the kv and return the only the requried kv.
    Only one communication will be done.
    r   rN   r1   Nc                 <    || _         || _        d | _        d| _        y Nr   )r   rQ   _aggregated_buffer_idxr   s      r!   rY   z_AllGatherRotater.__init__L  s    :>	r    r   c                     | xj                   dz  c_         | j                  6t        j                  |j	                         d| j
                        | _        y y )Nr`   r   )
gather_dimgroup)r   r   r9   all_gather_tensorr   r   r   s     r!   r   z"_AllGatherRotater.exchange_buffersR  sG    		Q	""*&*&<&<&&(Qdhh'D# +r    c                 $   t        j                  | j                        }|| j                  z
  }| j                  J t        | j                        | _        | j                  j                  t        j                  | j                              |   S rP   )r   get_rankr   r   r   r<   rF   r   )rX   r-   rA   s      r!   r   z_AllGatherRotater.next_bufferZ  sr    }}TXX&TYY&&222"-d.E.E"F&&,,T-@-@-JKCPPr    r   r   r    r!   r   r   F  sP    
4,, s t ELL T QU\\ Qr    r   r   rN   methodc                     |t         j                  }|t        j                  k(  rt	        | |      S |t        j
                  k(  rt        | |      S t        d|       )NzUnkonwn method )r3   r)   r#   r$   r   r%   r   r}   )r   rN   r   s      r!   _create_rotaterr   c  s\     ~**)))G,,	=++	+ W--!OF8"<==r    blocksend_to_nextc                     | j                         } t        j                  |      }|rt        t	        d|            dgz   n|dz
  gt        t	        d|dz
              z   }t        j                  | ||      S r   )r   r   r   rE   r   r9   r   )r   r   r   r   r   s        r!   _ring_rotater   q  sx     Er"D  	U1d^s"QhZ$uQq122 	
 udB//r    opr   c                 x   |r.|j                  d      |j                  d      k7  rt        d      |st        j                  rt	        d      t        | t        j                        r| }n| j                         }t        |t        j                        sJ d       t        j                  |      }	t        j                  |      }
d}|j                         }|j                         }t        t        j                  |      }t        |d      }t        |
      D ]  }|dkD  rh|j!                         }|d|j#                          j%                  |j&                        }||j#                         d j%                  |j&                        }||
dz
  k  rDt)        j*                  |j-                         |j-                         g      }|j/                  |      }t1        |	|
||	      }|t2        j4                  k(  r|dk(  st        j                  r|s|||d
f\  }}}}nZ||	k  r6d}||j7                  |d      d   |j7                  |d      d   d
f\  }}}}n|j7                  dd      d   ||df\  }}}} ||||fd|j8                  i|^}}}|j;                  |||        g |j=                         S )a  
    This is a generalized ring attention implementation that can support multiple attention ops.

    Note [Context parallelism load balance algorithm for causal masking]
    =====================
    This explanation uses an example to illustrate the CP algorithm with causal
    masking.

    Consider a scenario where the sequence length of q, k, and v is 4 (e.g.,
    q = (q0, q1, q2, q3)), and there are two ranks. For simplicity, we will discuss
    only q and k, as v follows the same pattern as k.

    The diagram below represents a complete QK^T operation without parallelism.
    The `****` entries indicate that the result is not required due to causal
    masking (e.g., q0k1 is marked as `****`).

    +----+------------------------+
    |    |  k0    k1   k2     k3  |
    +----+------------------------+
    | q0 | q0k0, ****, ****, **** |
    | q1 | q1k0, q1k1, ****, **** |
    | q2 | q2k0, q2k1, q2k2, **** |
    | q3 | q3k0, q3k1, q3k2, q3k3 |
    +----+------------------------+

    ### No Load Balance:

    In this scenario, each rank owns a local chunk of q, k, and v, with each chunk
    containing two elements. Rank0 is responsible for managing (q0, q1) and (k0, k1),
    while rank1 manages (q2, q3) and (k2, k3).

    First Iteration: Both rank0 and rank1 perform SDPA with their local qkv pairs.
    Causal masking is enabled as some results are not required (e.g., q0k1).

    Second Iteration: Local queries remain the same, but local kv pairs are exchanged.
    Rank0 now has (q0, q1) and (k2, k3); rank1 has (q2, q3) and (k0, k1). Rank0 performs
    no computation, while rank1 computes locally without causal masking since all results
    (q2k0, q2k1, q3k0, q3k1) are needed.

    ### Round-robin Load Balance:

    In this setup, each rank owns two local chunks of q, k, and v, with each chunk
    containing one element. Rank0 manages (q0, q3) and (k0, k3); Rank1 manages (q1, q2)
    and (k1, k2). Although the local chunks are not consecutive, they are concatenated to
    enable SDPA to be performed in a single call for each step. Consequently, the chunk()
    function may be required to prepare the correct q, k, and v configurations.

    First Iteration: Both ranks perform SDPA with their local qkv pairs, similar to the
    no-load-balance case. This iteration corresponds to the `if` of the
    (`if, `elif`, `else`) in the implemementation.

    Second Iteration: Rank0 now has (q0, q3) and (k1, k2); rank1 has (q1, q2) and
    (k0, k3). For rank0, no computation is needed for q0. However, computations for
    q3k1 and q3k2 are required, so only q3 is used for SDPA. This corresponds to the
    `else` of the (`if`, `elif`, `else`) in the implemementation.
    For rank1, k0 is not needed for q1 and q2, so only k3 is used for SDPA. This
    corresponds to the `elif` of (`if`, `elif`, `else`) in the implementation.

    Parameters
    ----------
    op:
        The attention op to use
    *args:
        additional args are passed to the op
    **kwargs:
        additional kwargs are passed to the op

    Returns
    -------
    out:
        The merged attention output
    softmax_lse:
        The logsumexp of the merged attention output
    r_   z>is_causal requires the same query and context sequence lengths)Load balancing requires `is_causal=True`.z&process group must be single dimensionN)rN   r   r`   r-   r.   r/   r0   FrD   Tr0   )r   r}   r3   r,   RuntimeErrorr8   r   r   	get_groupr   r   r   rM   r(   r   r   r   numelreshaperG   rH   rI   flattenr   r5   r   r   rF   ry   rm   rp   )rv   rN   r   rw   rx   ry   r0   r   r   r-   r   next_kvsdpa_mergerrotaterr/   is_causal_behaviorqkvr\   rf   rh   	logsumexprests                           r!   r~   r~   ~  s   h ejjmsxx{2!L
 	
 88FGG$))*@D^^b$++,V.VV,==Dr"DG
 ..
CEk88'JK b!$G4[ 52q5))+G-CIIK(00;CCIIKM*225;;?Eq>ii @AG..w7G0$!y
 !5!556+99 !&sE59Aq!W$Y !"		+	3A6-15a8	 Aq!W  %{{1!{4Q7eTIAq!W !#!
 )..	!

 !
Y 	i1k52n )K!(D((r    op_callargsc                 R   t         j                  j                  | ||      }t        j	                  d|j
                         t         j                  j                  j                  |       |j                  }|J d       |j                  rJ d       | t        j                  j                  k(  r.t        |j                  g|j                  i |j                   }n| t        j"                  j                  k(  r.t%        |j                  g|j                  i |j                   }nV| t        j&                  j                  k(  r.t)        |j                  g|j                  i |j                   }nt+        d      t         j                  j-                  ||j.                        S )NDispatching op_call: %s"output sharding should not be Noneinputs need to be redistributedzDCP only supports flash attention and memory efficient attention now.)r   _op_dispatcherunwrap_to_op_infologgerdebugschemasharding_propagator	propagateoutput_shardingneeds_redistributer   r   defaultr   compute_mesh
local_argslocal_kwargsr   r   r   r   r}   wrapoutput_specr   r   r   op_infor   local_resultss         r!   _sdpa_handlerr   +  s    $$66wfMG
LL*GNN; ..88A--O&L(LL&11T3TT1$::BBB@  

 ""

 
D@@HH	HD  

 ""

 
D<<DD	D@  

 ""
 "R
 	
 !!&&}o6Q6QRRr    c                    t        |      }t        |      }t        j                  j	                  | ||      }t
        j                  d|j                         t        j                  j                  j                  |       |j                  }|J d       |j                  rJ d       | t        j                  j                  k(  r.t        |j                   g|j"                  i |j$                  }n| t        j&                  j                  k(  r.t)        |j                   g|j"                  i |j$                  }nY| t        j*                  j                  k(  r.t-        |j                   g|j"                  i |j$                  }nt/        d|       t        j                  j1                  ||j2                        S )Nr   r   r   zop_call=)rE   rt   r   r   r   r   r   r   r   r   r   r   r   ,_scaled_dot_product_flash_attention_backwardr   1_scaled_dot_product_ring_flash_attention_backwardr   r   r   0_scaled_dot_product_efficient_attention_backward5_scaled_dot_product_ring_efficient_attention_backward,_scaled_dot_product_cudnn_attention_backward1_scaled_dot_product_ring_cudnn_attention_backwardr}   r   r   r   s         r!   _sdpa_backward_handlerr   W  s    :D;D $$66wfMG
LL*GNN; ..88A--O&L(LL&11T3TT1$CCKKKI  

 ""

 
DIIQQ	QM  

 ""

 
DEEMM	MI  

 ""
 "XWJ-00!!&&}o6Q6QRRr    grad_outgrad_out_namerh   r   c                    |
st         j                  rt        d      | j                         }t	        |t
        j                        sJ d       t        j                  |      }t        j                  |      }d}d}d\  }}}t         j                  rt        j                  n|j                  }t        j                  ||      }t        j                  ||      }t        j                  ||      }|j                         }|j                         }t        |d      }t        |dt         j"                        }t%        |      D ]  }|dkD  r|j'                         }d}||||j)                         z    j+                  |j,                        }||j)                         z  }||||j)                         z    j+                  |j,                        }||j)                         z  }||d	z
  k7  rDt        j.                  |j1                         |j1                         g      }|j3                  |       t5        ||||

      }|t6        j8                  k7  r|dk(  st         j                  r|
s||||||	f\  }}} }!}"}#n||k  r8||j;                  d|      d   |j;                  d|      d   |||	f\  }}} }!}"}#nm|j;                  d|      d	   |||j;                  d|      d	   |j;                  d|      d	   |	j;                  d|      d	   j                         f\  }}} }!}"}#|"||<    |d||| |!|#|j<                  d|^}}}}$nEt        j                  ||      }t        j                  ||      }t        j                  ||      }d}%|dk(  r||z  }||z  }nd}|j'                         }||||j)                         z    j+                  |j,                        }||j)                         z  }||||j)                         z    j+                  |j,                        }||k  r3t         j                  r#t?        ||||%dd      }t?        ||||%dd      }n
||z  }||z  }t        j.                  |j1                         |j1                         g      }|j3                  |       ||k  st         j                  s||z  }}t?        ||||%d	d      } |J |J |jA                  |j                        }|j'                         jA                  |j                        }|d|j)                          j+                  |j,                        }||j)                         d j+                  |j,                        }|||g$S )z7This API implements the backward of the ring attention.r   zmust be single dimensionNNNN)rk   r_   )r   r   r`   r   rD   )rw   rx   ry   rh   r   r0   Tra   r   )!r3   r,   r   r   r8   r   r   r   r   r(   rH   rU   rk   
zeros_liker   r   r#   r$   r   r   r   r   rG   rI   r   r   r5   r   r   rF   ry   rK   rl   )&rv   rN   r   r   r   rw   rx   ry   rh   r   r0   r   r   r-   r   r   next_grad_kvgrad_query_	grad_key_grad_value_accum_dtype
grad_querygrad_key
grad_value
kv_rotaterdkv_rotaterr/   bufferpointerr   r   r   r   out_doutrg   r   rf   s&                                         r!   "_templated_ring_attention_backwardr    s    88FGG		Bb$++,H.HH,==Dr"DGL*:'KK#.#=#=%--5;;K!!%{;J;7H!!%{;J
..
CE Q'J!"a0H0HIK4[ vq5++-FG7SYY[#89AA#))LCsyy{"G7Wu{{}%<=EEekkREu{{}$Gq=ii @AG''00$!y
 !5!55Avk==Y,13sHi+X(1atSd
 IIaWI-a0KKwK/2,(1atS KKwK/2IIaWI-a0NN1'N215 OOA7O3A6AAC	,(1atS %)F=! :< :,22: :6KK$  **5DK((K@I**5DK6	!H+%JG&224L#Gg8H.HIQQH x~~''G%g*:J:J:L0LMUU  J Dy[<<*. -.
 I%k)
yy("2"2"4j6H6H6J!KL$$\29K;;+%J(*J_vp    """u{{+J**,//		:L.hnn./77GHj..023;;J<L<LMJ 
	 r    	cum_seq_q	cum_seq_kmax_qmax_kphilox_seedphilox_offsetc                    d}t        | |t        j                  j                  fi d|ddd|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|S )Nr_   r   r   rw   rx   ry   rh   r   r0   r  r	  r
  r  rz   r  r  ru   )r  r   r   r   )rv   r   rw   rx   ry   rh   r   r  r	  r
  r  rz   r0   r  r  ru   rN   s                    r!   r   r   +  s    & G-99AA 	
 !            !"  #$ $%& ' r    biasgrad_input_maskc                l    d}t        | |t        j                  j                  f|d||||||||	|
|||dS )Nr_   	grad_out_)r   r   rw   rx   ry   r   rh   r   r  r  rz   r  r0   ru   )r  r   r   r   )rv   r   rw   rx   ry   r  rh   r   r  r  rz   r  r0   ru   rN   s                  r!   r   r   V  s\    " G-==EE !#'# r    c                    d}t        | |t        j                  j                  fi d|ddd|d|d|d|d|d	|d
|d|	d|
d|d|d|d|d|d|S )Nr_   r   r   rw   rx   ry   rh   r   r  r  r   r  r	  r
  r  rz   r0   ru   )r  r   r   r   )rv   r   rw   rx   ry   rh   r   r  r  r   r  r	  r
  r  rz   r0   ru   rN   s                     r!   r   r   }  s    ( G-99AA 	
 !        $     !" #$ %& '( ) r    _replaced_functionsfn	fn_moduledevice_meshinput_fn	output_fnc                     dt         dt        t            dt        t            dt         ffd}| t        v ry || ||      }t        || j                  |       | j                  | ft        |<   y)aZ  
    ``distribute_function`` is an experimental API that allows users to "distribute"
    the inputs and outputs of a function. Similar to ``distribute_module``, this API
    installs hooks to the ``fn`` to convert the inputs and outputs. There are two
    major differences between ``distribute_function`` and ``distribute_module``.
    First, a function does not have parammeters and buffers, as a result,
    ``distribute_function`` itself won't convert any parameters/buffers but simply
    install the input and output hooks.  The tensor conversion will happen in the hooks.
    Another difference is an nn.Module subclass can have several instances and each
    instance be fed into ``distribute_module`` independently with affecting other
    instance. On the other hand, function is a singleton object. So if a function
    is distributed by ``distribute_function`` all subsequent calls to the function
    will invoke the installed hooks.

    Args:
        fn (Callable): the function to be distributed.
        fn_module (types.ModuleType): the Python module that the function is declared.
            e.g., if ``fn`` is ``torch.nn.functional.scaled_dot_product_attention``,
            ``fn_module`` is ``torch.nn.functional``.
        device_mesh (:class:`DeviceMesh`): the device mesh that will be used by the
            input and output hooks to distribute the tensors.
        input_fn (Optioinal[Callable]): the hook to distribute or convert the input
            arguments of ``fn``.
        output_fn (Optioinal[Callable]): the hook to distribute or convert the output
            arguments of ``fn``.
    	target_fnr  r  r1   c                 p     dt         t        df   dt        t        t        f   dt        f fd}|S )Nr   .r   r1   c                  P     g| i |\  } } | i |}	 |      }|S rP   r   )r   r   outputr  r  r  r  s      r!   inner_fnz7_distribute_function.<locals>.wrapper.<locals>.inner_fn  sG    #'EdEfEf//F$";7Mr    )rt   r	   dictstr)r  r  r  r  r  s   ``` r!   wrapperz%_distribute_function.<locals>.wrapper  s9    	E#s(O 	tCH~ 	# 	 	 r    N)r
   r   r  setattrr   )r  r  r  r  r  r"  
wrapper_fns     `    r!   _distribute_functionr%    ss    D'/'9FNxFX	 
  Xy1JIr{{J/'){{B&7
#r    c                 H    | t         vryt         |    \  }}t        |||       y)z>Restore the function that is replaced by _distribute_function.N)r  r#  )r  r  original_nameoriginal_fns       r!   _restore_functionr)    s+    
 
$$!4R!8M;I}k2r    r   c               #      K   t         j                  j                  } i | t        t         j                  _        d | t         j                  _        yw)z2Enables DTensor dispatcher to dispatch SDPA to CP.N)r   r   _custom_op_handlerscustomized_ops)old_handlerss    r!   _enable_cp_dispatcherr.    sB      ))==L1SL1SN1SG.	1=G.s   AAc                      e Zd ZU dZ ej
                         Zded<   dej                  de
dej                  fdZedej                  deeej                   eef   d	f   de
deeej                   eef   d	f   fd
       Zedej                  deej                   eeej                   eef   d	f   f   de
deeej                   eef   eeej                   eef   d	f   f   fd       Zy)_AttentionContextParallela$  
    Applies context parallel optimizations to the attention layer.

    This will work for nn.MultiHeadedAttention and custom attention layers that
    call F.scaled_dotproduct_attention with a simliar signature.

    This expects the `forward` method consumes either:

    * a single tensor for self attention
    * one argument for each of: query, key, value

    This currently only supports ring attention and the
    SDPBackend.FLASH_ATTENTION backend. See sdpa_kernel.

    Non-flash attention backends will result in incorrect results.
    z)weakref.WeakKeyDictionary[nn.Module, Any]_CONTEXT_MANAGERSmoduler  r1   c                     t        |t              s#t        t        |       dt        |        d      |j                  dk(  st        t        ||| j                  | j                        S )Nz is not supported by z yet.r`   )r  r  )r8   r   
ValueErrortypendimr   	_input_fn
_output_fn)rX   r2  r  s      r!   _applyz _AttentionContextParallel._apply  sk    +z2$%%:4:,eL  1$ ^^oo	
 	
r    inputs.c                     t               g}dt        j                  dd f fd}g }|D ]  }t        |t        j                        r7t        |t              s't	        j
                  |j                         ||d      }t        |t        j                        r|j                  r|j                  |       |j                  |        t               }|j                          | j                  <   t        |      S )Ngradr1   c                 ~    j                   v r.j                      j                  d d d        j                   = y y rP   )r1  __exit__)r<  clsr2  s    r!   backward_hookz:_AttentionContextParallel._input_fn.<locals>.backward_hook8  sA    ...%%f-66tT4H))&1 /r    F	run_check)r   rH   rs   r8   r   
from_localr   requires_gradregister_hookappendr.  	__enter__r1  rt   )	r?  r2  r:  r  	placementr@  inpinputmanagers	   ``       r!   r7  z#_AttentionContextParallel._input_fn.  s     [M		2 	2 	2  		E%.z%7Q**$$&Y% %.53F3F##M2JJu		 ()(/f%Szr    outputsc                      j                      j                  d d d         j                   = dt        j                  dd f fd}g }t	        |t        j                        r|gn|D ]l  }t	        |t
              r|j                         n|}t	        |t        j                        r|j                  r|j                  |       |j                  |       n t	        |t        j                        r|d   S t        |      S )Nr<  r1   c                 v    j                   vr*t               }|j                          |j                   <   y y rP   )r1  r.  rG  )r<  rK  r?  r2  s     r!   r@  z;_AttentionContextParallel._output_fn.<locals>.backward_hook\  s:    S222/1!!#07%%f- 3r    r   )r1  r>  rH   rs   r8   r   to_localrD  rE  rF  rt   )r?  r2  rL  r  r@  rh   r  s   ``     r!   r8  z$_AttentionContextParallel._output_fnP  s     	f%..tT4@!!&)	8 	8 	8 #-gu||#Dwi' 	F*4VW*EV__&6F&%,,/F4H4H$$]3JJv	 gu||,q6MSzr    N)r   r   r   rq   weakrefWeakKeyDictionaryr1  r+   r   Moduler   r9  classmethodrt   r   rH   rs   rr   floatr7  r8  r   r    r!   r0  r0    sW   & 	"!!# B 
RYY 
Z 
BII 
  		 eELL#u45s:;  	
 
uU\\3-.3	4 B 		 u||U5sE1I+JC+O%PPQ  	
 
ellC&'uU\\35M/NPS/S)TT
 r    r0  c              #      K   dt         dt        t        df   dt        t        t        f   dt        t        t        df   t        t        t        f   f   f fd}dt         dt        dt        fd}t        t        j                  t        |||       t               5  d	 d	d	d	       t        t        j                  t               y	# 1 sw Y   (xY ww)
zJReplace SDPA with the CP-wrapped version and enable DTensor CP dispatcher.rv   r   .r   r1   c           	         t              g}g }t        j                  ||j                               D ]V  }t	        |t
        j                        r)t	        |t              st        j                  || |d      }|j                  |       X t        |dt        |             }t        t        |j                         |t        |      d              }||fS )NFrA  r   )r   	itertoolschainvaluesr8   rH   rs   r   rC  rF  rt   lenr   zipkeys)	rv   r   r   rH  all_argsargnew_args
new_kwargsrN   s	           r!   attention_input_fnz-_context_parallel.<locals>.attention_input_fnv  s     7^$	??49 	!C#u||,ZW5M((dIOOOC 		! !c$i01#fkkmXc$ik-BCD
##r    rL  c                    g }t        |t        j                        r|gn|D ]5  }t        |t              r|j	                         n|}|j                  |       7 t        |t        j                        r|d   S t        |      S r   )r8   rH   rs   r   rO  rF  rt   )rv   rL  new_outputsr  s       r!   attention_output_fnz._context_parallel.<locals>.attention_output_fn  sq    #-gu||#Dwi' 	'F*4VW*EV__&6Fv&	' gu||,q>![!!r    N)
r   rt   r	   r   r!  r%  rc   scaled_dot_product_attentionr.  r)  )rN   rv   ra  rd  s   `   r!   _context_parallelrf  r  s     $$!&sCx$<@cN$	uS#XS#X.	/$ 	"* 	"s 	"s 	" 	&&	 
	   a44a8 s   BCC!'CCCc            
           e Zd Zeedej                  dededej                  fd              Z	eedej                  dededej                  fd              Z
y)_LoadBalancerr  rv   rN   r1   c                      y rP   r   r?  r  rv   rN   s       r!   shardz_LoadBalancer.shard       r    c                      y rP   r   rj  s       r!   unshardz_LoadBalancer.unshard  rl  r    N)r   r   r   rS  r   rH   rs   r   rr   rk  rn  r   r    r!   rh  rh    s    \\)3>A	   \\)3>A	  r    rh  c            	           e Zd ZdZedej                  dededej                  fd       Z	edej                  dededej                  fd       Z
y)	_SequentialSharderz
    This load balancer chunks the buffer into cp_world_size and rank0 gets
    0th shard, rank1 gets 1st shard, ...
    So this doesn't have any load balancing effect when using the causal masking.
    r  rv   rN   r1   c                     |j                         |   |j                         z  dk(  sJ |j                  |j                         |      |j                            S )Nr   rD   )r   rF   get_local_rankrj  s       r!   rk  z_SequentialSharder.shard  sO     {{}W%		3q888||DIIKW|5d6I6I6KLLr    c                     |j                         }t        |j                               D cg c]  }t        j                  |       }}t        j                  |||       t        j                  ||      S c c}w )NrD   )r   r   r   rH   
empty_liker9   all_gather_inplacerI   )r?  r  rv   rN   _all_bufferss         r!   rn  z_SequentialSharder.unshard  sd     ""$9>tyy{9KLAu''/LLVT:yy'22 Ms   A8N)r   r   r   rq   rS  rH   rs   r   rr   rk  rn  r   r    r!   rp  rp    s     M\\M)3M>AM	M M 3\\3)33>A3	3 3r    rp  c            	           e Zd ZdZdZedej                  dede	dej                  fd       Z
edej                  dede	dej                  fd       Zy	)
_RoundRobinLoadBalanceraG  
    This load balancer chunk the buffer into cp_world_size * ROUND_ROBIN_CYCLE
    shards, and uses a round robin approach to achieve load balancing.
    Since ROUND_ROBIN_CYCLE being 2 will achieve perfect load balancing for
    causal masking, we assume ROUND_ROBIN_CYCLE is always 2 to simplify the
    implementation.
    r_   r  rv   rN   r1   c                 &   | j                   dk(  sJ d       |j                         }|j                         }|j                         |   |dz  z  dk(  sJ |j                  |dz  |      }t	        j
                  ||   ||dz  |z
  dz
     f|      S )Nr_   @The current implementation only works if ROUND_ROBIN_CYCLE is 2.r   rD   r`   )rf   r   rr  rF   rH   rI   )r?  r  rv   rN   cp_world_sizecp_rankrJ   s          r!   rk  z_RoundRobinLoadBalancer.shard  s     $$) 	
N	
) 		%%'{{}W%):;q@@@ma/W=yyG_f]Q%6%@1%DEF
 	
r    c                    | j                   dk(  sJ d       |j                         }|j                         }t        |      D cg c]  }t	        j
                  |       }}t        j                  |||       |D cg c]  }|j                  d|      D ]  }|  }	}}t        |	      }
t        |	      D ]'  \  }}|dz  dk(  r	||
|dz  <   ||
|dz  |dz  z
  dz
  <   ) t	        j                  |
|      S c c}w c c}}w )Nr_   r{  rD   r   r`   )rf   r   r   r   rH   rt  r9   ru  rF   rE   	enumeraterI   )r?  r  rv   rN   r|  rv  rw  bsbsliced_buffersordered_buffersr/   s               r!   rn  z_RoundRobinLoadBalancer.unshard  s    $$) 	
N	
) ""$		9>}9MNAu''/NNVT:&1TAGGA7G<STb"T"TT~.n- 	FDAq1uz*+Q'DE 1Q!V <q @A		F
 yyg66 OTs   C9>!C>N)r   r   r   rq   rf   rS  rH   rs   r   rr   rk  rn  r   r    r!   ry  ry    s     
\\
)3
>A
	
 
 7\\7)37>A7	7 7r    ry  buffersbuffer_seq_dimsc                     g }t         j                  rt        nt        }t	        ||      D ]'  \  }}|j                  |j                  || |             ) |S )zFShard the buffers along the sequence dimensions according to CP rules.)r3   r,   ry  rp  r[  rF  rk  )rv   r  r  new_bufferssharderr  rN   s          r!   _context_parallel_buffersr    sb     K ** 	  
 w8 A7==w?@A r    )r  r  no_restore_buffersr  c             #     	K   |g n|}|g n|}|
t               n|}t        |      t        |      k7  rt        d      |D ]!  	t        	fd|D              rt        d       |D cg c]  }||v rdn|j	                          }}t        | ||      }t        ||      D ]A  \  	}|j	                         }	j                  |j                         	j                  |       C t        d|       5  d ddd       t        ||      D ]4  \  	}|		j                  |j                         	j                  |       6 yc c}w # 1 sw Y   RxY ww)a  

    ``context_parallel`` is an experimental API to enable context
    parallelism (CP). This API performs two actions: 1) patch the SDPA
    (``torch.nn.functional.scaled_dot_product_attention``) with the CP-enabled
    one, 2) shard ``buffers`` along the sequence dimension and each rank will
    preserve the corresponding shard according ``mesh``.

    Args:
        mesh (:class:`DeviceMesh`): the device mesh for the context parallelism.
        buffers (Optional[List[torch.Tensor]]): buffers that the usage depend
            on the sequence dimension. Examples are input batch, labels and
            positional embedding buffers. These buffers must be sharded along
            the sequence dimension to ensure the accuracy. The sharding will
            happen in-place, the buffer's shape will change within the context.
            The buffers will be restored after the context finishes.
            ``no_restore_buffers`` can be used to specify which buffers don't
            need to be restored. Note that ``buffers`` should not contain any
            nn.Parameter.
        buffer_seq_dims (Optional[List[int]]): the sequence dimensions of ``buffers``.
        no_restore_buffers (Optional[Set[torch.Tensor]]): buffers in these set
            won't be restored after the context exits. This set must be a subset
            of ``buffers``. If the buffers won't be used after the context exits,
            these buffers can be put in this list to avoid extra restore time.

    .. warning::
        `torch.distributed._tensor.experimental.attention.context_parallel` is a
        prototype feature in PyTorch. The API is subject to change.
    Nz>`seq_dims` must have the same number of elements as `buffers`.c              3   &   K   | ]  }|u  
 y wrP   r   ).0r  r  s     r!   	<genexpr>z#context_parallel.<locals>.<genexpr>=  s     011;0s   z3`no_restore_buffers` must be a subset of `buffers`.r_   )rN   rv   )setrZ  r4  anycloner  r[  resize_rG   copy_rf  )
rv   r  r  r  r  original_buffersrJ   rF   original_bufferr  s
            @r!   r   r     sd    L ObG+3bO"4"<BT
7|s?++L
 	
 % T000RSST
 QXX1%7 7QWWYFXX&tWoFFWf- u{{#U
 
14	0  $'w0@#A *&NN?001LL)* Y s7   AEE*EA,E3E	8E4E	EEseq_dimsc           	          t         j                  rt        nt        }t	        ||      D cg c]  \  }}|j                  || |       c}}S c c}}w )a  
    Unshard the tensors (e.g., output) that are sharded due to context parallelism.

    Args:
        mesh (:class:`DeviceMesh`): the device mesh for the context parallelism.
        buffers (List[torch.Tensor]): the buffers to be unsharded.
        seq_dims (List[int]): the sequence dimensions of ``buffers``. This list
            must have the same length as ``buffers``.

    Returns:
        List[torch.Tensor]: the unsharded buffers.
    )r3   r,   ry  rp  r[  rn  )rv   r  r  r  r  r?   s         r!   context_parallel_unshardr  P  sH    ( ** 	  
 9<GX8NOfaGOOAtS)OOOs   Ar)   c                     | dk(  rt         j                  t        _        y| dk(  rt         j                  t        _        yt        d|  d      )a  
    Context Parallel SDPA requires the rotation of kv shards. Users can call this
    API to specify which rotation method to use. "alltoall" shuffles the kv shards
    using all-to-all collective. While "allgather" gathers the kv shards using
    all-gather collective after the first sub-SDPA computation. If this API has not
    been called, the default rotate method is "allgather".

    Args:
        rotate_method (str): the rotate method to use. Currently only supports
        "allgather" and "alltoall". If a different string other than these two
        is passed in, the function will raise an error.

    Returns:
        None
    	allgatheralltoallz(Context Parallel does not support using z for kv shards rotationN)r#   r%   r3   r)   r$   r}   )r)   s    r!   r   r   j  sO      #$1$<$<!	*	$$1$<$<!!"O#:<
 	
r    )        FF)NTr  F)NTr  FFrP   )F)NN)m
contextlibrW  loggingtypesrP  abcr   r   collections.abcr   dataclassesr   enumr   r   typingr	   r
   r   r   r   rH   torch.distributeddistributedr   )torch.distributed._functional_collectives_functional_collectivesr9   torch.nn.functionalr   
functionalrc   torch.distributed.device_meshr   torch.distributed.tensorr   r   r   r   'torch.distributed.tensor.parallel.styler   __all__r   r#   opsr   	getLoggerr   r   r'   r3   rr   r*   r5   rs   r<   rK   rM   rT  rt   r   r   r   r   r   r   r   r   r   r   r   r~   _ops
OpOverloadr   r!  r   r   r  r   r   r   r   r   r   r   r   r   r   r,  r  r+   
ModuleTyper%  r)  contextmanagerr.  r0  rf  rh  rp  ry  rE   r  no_gradr  r   r  r   r   r    r!   <module>r     sS        # % !  ; ;    8 8    4 Q Q A 2
3d D 
 yy~~			8	$ < < < &'$
$$#&$37$$(  &ll&	& 
& 	&
 
& 
& \\&2I@ I@b # "
<< 
 <<	
    E? 5<<B )-#  " 
 <<  
  <<	 
 %        E?  5<< P )-##" ""
"<<" 
" <<	"
 %" " " " " E?" 5<<"J'8 '/3 /)| )&Q Q< LP>>$'>19-1H>>
0<<
0!..
0>B
0
\\
0( j)
j)j) 	j) <<	j)
 
j) <<j) j) j) 5<<j)Z)SZZ"")S

)S f)S 	)SX(SZZ""(S

(S f(S 	(SVf
ff 	f ll	f
 f <<f 
f <<f 
f ||f f f 5<<ft "#(
(ll( <<( 
	(
 <<( 
( ||( ||( ||( ( ( ( ( ( <<(" E?#($ 5<<%(p $ "$
$ll$ <<$ 
	$
 <<$ ,,$ 
$ ||$ $ <<$ $ 49%$ $ E?$  5<<!$r "%*
*ll* <<* 
	*
 <<* 
* ||* * <<* ||* ||* ||* * * *  !*$ E?%*& 5<<'*\ 	,,44m55==?U0088-99AACY,,44m55==?U =? T(E#x-$889 > $($(686868 68 x 	68
 !68 
68r	3( 	3u/?/? 	3D 	3 >y)9: > >h hV -9s -9* -9CS9T -9 -9`C 3 30.7m .7b
%,, #Y 
%,,	$  -1+/6:?*
?* d5<<()?* d3i(	?*
 !U\\!23?*  ?*  ?*D P
P%,,P 3iP 
%,,	P P2
S 
T 
r    