
    Vh7X                     F	   U d dl Zd dlZd dlmZmZ ddlmZmZm Z m	Z	m
Z
  ed      Z e	j                   ej                  ej                              Ze	j                  ed<   d Zd Zd	 Ze
j(                  d
        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Ze
j(                  d        Z e
j(                  d        Z!e
j(                  d        Z"e
j(                  de	j                  fd       Z#e
j(                  de	j                  fd       Z$e
j(                  de	j                  fd       Z%e
j(                  d        Z&e
j(                  d        Z'e
j(                  d        Z(e
j(                  d        Z)e
j(                  d         Z*e
j(                  d!        Z+e
j(                  d"        Z,e
j(                  d#e	jZ                  d$e	jZ                  d%e.d&e.d'e.d(e	jZ                  d)e	j^                  d*d+d,e	jZ                  d-e.d.e	jZ                  fd/       Z0e
j(                  d0e	j                  d1e	j                  fd2       Z1e
j(                  d3        Z2e
j(                  d4        Z3e
j(                  d0e	j                  d1e	j                  fd5       Z4e
j(                  d6        Z5e
j(                  d7        Z6e
j(                  d8e	j                  d9e	j                  d:e	j                  d;e	j                  fd<       Z7e
j(                  d=e	j                  d>e	j                  d9e	j                  d:e	j                  d;e	j                  f
d?       Z8e
j(                  d e	j                  d@       e	j                  d@      fdAe	j                  d:e	j                  d;e	j                  fdB       Z9e
j(                  dNdC       Z:e
j(                  dD        Z;dEedFefdGZ<e<ddHdIe	j                  dJe=dFe	j                  fdK       Z>e<ddHdLedJe=dFe	j                  fdM       Z?y)O    N)AnyTypeVar   )_log2	libdevicemathtltriton_T_LOG_2_Ec                  <   t         j                  j                  } t         j                  j                  j	                  dd       x}rAt        | j                  |j                        ry | j                  |j                                y t        j                  d       y )NcpuzOCould not find an active CPU backend. Generated kernels will not be executable!)
r
   runtimedriverbackendsget
isinstanceactive
set_activewarningswarn)r   backends     V/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/runtime/triton_helpers.pyset_driver_to_cpur      so    ^^""F//**..ud;;w;fmmW^^4'..*+MMY    c                     t         j                  j                  } t         j                  j                  j	                         D ]  \  }}|j                  j                         s!|dk7  s't        | j                  |j                        s@t        | j                  d      r,t        | j                  j                  |j                        r y | j                  |j                                 y  t        d      )Nr   _objz$Could not find an active GPU backend)r
   r   r   r   items	is_activer   r   hasattrr   r   RuntimeError)r   namer   s      r   set_driver_to_gpur#      s    ^^""F11779 g>>##%$%- 6=='..96==&1v}}117>>B gnn./ =
>>r   c                     t         j                  j                  } | j                  j	                         }t         j
                  j
                  j                  |      }|j                  t                     }|j                  S N)
r
   r   r   r   get_current_targetcompilermake_backendparse_optionsdict__dict__)r   targetr   optionss       r   get_backend_optionsr.   .   s[    ^^""F]]--/Foo&&33F;G##DF+Gr   c                 P    | t        j                  dt         j                        z   S )N)r   )r	   zerosint1xs    r   promote_to_tensorr4   6   s     rxxbgg&&&r   c                     | |z  }| |z  }t        j                  |dk7  |dz
  |      }t        j                  | dk  |dk  k7  ||      S )Nr   r   r	   where)abquot	remainderfixeds        r   div_floor_integerr=   <   sP     6DAIHHY!^TAXt4E88QUA&t44r   c                 `    | |z  }t        j                  |dk7  xr | dk  |dk  k7  ||z   |      S )Nr   r6   )r8   r9   r;   s      r   remainder_integerr?   F   s:     AI88IN;QAE(:Y]IVVr   c                 H    t        |       j                  j                         S r%   )r4   dtypeis_floatingr2   s    r   rB   rB   M   s    Q%%1133r   c                     | |z  S r%    r8   r9   s     r   _prod_accumulaterF   R       q5Lr   c                 8    t        j                  | |t              S r%   )r	   reducerF   )inputaxiss     r   prodrL   W   s    99UD"233r   c                 `    | |k  }t        |       r|| | k7  z  }t        j                  || |      S r%   rB   r	   r7   r8   r9   masks      r   minimumrQ   \   2    q5D1~Q88D!Qr   c                 `    | |kD  }t        |       r|| | k7  z  }t        j                  || |      S r%   rN   rO   s      r   maximumrT   d   rR   r   c                 8    t        j                  | |t              S r%   )r	   rI   rQ   r8   dims     r   min2rX   l       99QW%%r   c                 8    t        j                  | |t              S r%   )r	   rI   rT   rV   s     r   max2r[   q   rY   r   c                     | |k  }| |k(  }t        |       r| | k7  }||k7  }||xr | z  }||xr |z  }||||k  z  z  }t        j                  || |      t        j                  |||      fS r%   rN   a_valuea_indexb_valueb_indexrP   equala_isnanb_isnans           r   minimum_with_indexre   v       WDwE7W$W$'K'$W$ 	EWw&''D88D'7+RXXdGW-MMMr   c                     | |kD  }| |k(  }t        |       r| | k7  }||k7  }||xr | z  }||xr |z  }||||k  z  z  }t        j                  || |      t        j                  |||      fS r%   rN   r]   s           r   maximum_with_indexrh      rf   r   c                 <    t        j                  | |f|t              S r%   )r	   rI   re   valueindexrW   s      r   min_with_indexrm          99eU^S*<==r   c                 <    t        j                  | |f|t              S r%   )r	   rI   rh   rj   s      r   max_with_indexrp      rn   r   use_fast_mathc                 h    |rt        j                  | t        z        S t        j                  |       S r%   )r   exp2r   r   exp)r3   rq   s     r   rt   rt      s&    ~~a(l++xx{r   c                     t        | |      }|d d d f   }t        j                  |t        d      k(  d| |z
        }t        j                  |t        ||      z  |      }||fS )N-infr   )r[   r	   r7   floatsumrt   )lhs_maxlhs_sumrW   rq   out_maxout_max_keepdimdeltaout_sums           r   online_softmax_reducer      sc    7C Gag&OHH_f5q'O:STEffWs5-88#>GGr   c                     t        | |      }t        j                  |t        d      k(  dt	        | |z
  |            }t        j                  |t        d      k(  dt	        ||z
  |            }||z  |z   }||fS )z
    When we do combine, we assume lhs is the accumulator and rhs is the next
    block of data.
    Then rhs_sum is always 1. With that assumption, we can save some registers
    and computation.
    rv   g      ?)rT   r	   r7   rw   rt   )ry   rz   rhs_maxrq   r{   	lhs_scale	rhs_scaler~   s           r   online_softmax_combiner      s     gw'G5= #s7W+<m'LI 5= #s7W+<m'LI 	!I-GGr   c                     |rCt        j                  |j                  d|j                        }| }t        j                  |      }n| |z
  }|dz   }|||z  z   }||| |z
  z  z   }|||fS )Nr   )r	   fullshaperA   
zeros_like)	rk   meanm2weightfirst_iteration
new_weightnew_meannew_m2r}   s	            r   welford_reducer      st    WWV\\1fll;
r"aZ
%*,,eux/00VZ''r   c                     || z
  }||z   }t        j                  |dk(  d||z        }| ||z  z   ||z   ||z  |z  |z  z   |fS )Ng        r6   )	mean_1m2_1weight_1mean_2m2_2weight_2r}   r   	w2_over_ws	            r   welford_combiner      se    VOEH$Js*CJ1FGI""teemh.:: r   c                 >    t        j                  | ||f|t              S r%   )r	   rI   r   )r   r   r   rW   s       r   welfordr      s    99dB'o>>r   c                 2    t        j                  | |       |S r%   )r	   device_assert)condmsgrs      r   device_assert_thenr      s    T3Hr   c                 Z   t        j                  | |      \  }}}}|j                  t         j                        }|j                  t         j                        }||dz  z  }||z
  }	||	j                  t         j                        z  }|j                  t         j                        |z   }|S )N    )r	   	randint4xtouint64int64)
seedoffsetlowhighr0r1_r2_r3resultsizes
             r   	randint64r      s    ||D&1BC	ryy	B	ryy	B28_F#:Ddggbii((FYYrxx 3&FMr   c                     | |z  S r%   rD   rE   s     r   _any_combiner      rG   r   c                 8    t        j                  | |t              S r%   )r	   rI   r   rV   s     r   anyr      s    99Q\**r   valuesboundaries_ptrBOUNDARIES_SIZEBOUNDARIES_UNDERLYING_NUMELBOUNDARIES_STRIDEboundary_indicesindexing_dtyperightbool
sorter_ptrSORTER_STRIDEsorter_indicesc                    t        j                  | j                  |      }t        j                  | j                  ||      }|dz   }|dkD  r||z   dz  }||z  |z   |k  xr ||k  }||	|n t        j                  ||
z   |	|z  z   |d      }t        j                  ||z   ||z  z   |d      }|r| |k\  }n| |kD  }t        j
                  ||z  |dz   |      }t        j
                  |||      }|dz   dz  }|dkD  r|S )a  
    See [Note: Inductor bucketize op]

    Inputs:
    -------
    values: the values to bucketize.
    boundaries_ptr: a pointer to the beginning of the boundaries tensor, in 1-D.
    BOUNDARIES_SIZE: the length of the last dimension of the boundaries tensor (i.e. one
    individual set of boundaries).
    BOUNDARIES_UNDERLYING_NUMEL: the length of the boundaries tensor, in 1-D, ignoring
    any striding.
    BOUNDARIES_STRIDE: the stride of the last dimension of the boundaries tensor
    boundary_indices: a tensor of the same size as "values"; each element is an index
    into a 1-D, un-strided boundaries tensor, pointing to the first element in the set
    of boundaries used for that value.
    indexing_dtype: the dtype used for indexing into the boundaries tensor, and the
    return dtype.
    right: if true, use boundary intervals closed on the left; otherwise use intervals
    closed on the right.
    sorter_ptr: an optional pointer to a sorter tensor of the same shape as boundaries,
    but potentially different striding.  If present, this allows us to treat boundaries
    as sorted even if the elements of boundaries are unsorted.
    SORTER_STRIDE: must be present if sorter_ptr is non-None; the stride of the last
    dimension of the sorter tensor.
    sorter_indices: must be present if sorter_ptr is non-None; see "boundary_indices".
    BLOCK_SHAPE: the shape of the data block being processed.
    )rA   r      r   )rP   other)r	   r0   r   r   loadr7   )r   r   r   r   r   r   r   r   r   r   r   r   r   
full_rangemidrP   mid_indicesbucket_upper_boundis_aboves                      r   bucketize_binary_searchr     sC   T ((6<<~
6C776<<GD 1$J
q.cza##&66'( B,//,A 	
 !]%: ^+mc.AA 	  WW--0AK0OO

 !33H 22Hhhx$a5xx$, 1n*
9 q.< Jr   DTYPE_VALUE_AS_UINT
DTYPE_PACKc                     t         j                  j                  |      }|j                  }| j	                  |d      j	                  |      }|j	                  |      ||z  z  S NTbitcastr	   core_constexpr_to_valueprimitive_bitwidthr   )rk   flagr   r   bitwidthuvs         r   pack_value_flagr   S  s[     ''556IJ"55H	%t	4	7	7
	CB77:".11r   c                     t         j                  j                  |      }t         j                  j                  |      }|j                  }| |z	  j	                  |      }|j	                  |d      S r   r   )packDTYPE_VALUEr   r   
value_uints        r   unpack_valuer   a  sb     ''--k:K''556IJ"55H("&&':;J==d=33r   c                 $    | j                  |      S r%   )r   )r   
DTYPE_FLAGs     r   unpack_flagr   o  s    77:r   c                    |j                   }t        |t        j                  |j                  d|      ||      }|dkD  rt        j
                  | |z   |d       t        j                  g |      }d}	|dz
  }
|
dk\  ryt        j                  g d|      }|dk(  r-t        j                  | |
z   dd      }t        ||      }|dk(  r-t        |||      }|	r
 |||      }n|}d}	|dk(  rd}
n|
dz
  }
|
dk\  ry|	r
 |||      }n|}t        |t        j                  g d|      ||      }t        j
                  | |z   |d       |S )	a  Compute exclusive scan of a scalar value between blocks

    Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back

    scratch_base: Pointer to scratch space in global memory
    block_value: Scalar value for this block
    index: Scalar index of this block relative to the current scan
    combine_fn: Function ``(value, value) -> value`` which is scanned over
    DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value``
    DTYPE_PACK: Unsigned type twice the width of block_value

    NOTE: This function is limited to values which are 32-bits or less because
    we need to pack (value, flag) into a single unsigned int.
    r   r   relaxedsemFTr   )
rA   r   r	   r   r   atomic_xchgr0   
atomic_addr   r   )scratch_baseblock_valuerl   
combine_fnr   r   r   r   exclusive_prefixprefix_validtest_targetr   rk   inclusive_prefixs                 r   !exclusive_scan_decoupled_lookbackr   t  sm   0 ##K
!!1&9:	D qy
|e+TyA xxK0L!)K

wwr112ai==!;QINDt%89D ai T;0CD)%1AB$L19K%/K# 
( %&6D&
A*+	D NN<%'9=r   c                    |dkD  r|j                  t        j                  d      }t        j                  | d|z  z   dz   |       t        j                          t        j
                  g dt        j                        }t        j                  | d|z  z   dz   |d       t        j                  g |j                        }d}|dz
  }|dk\  rt        j
                  g dt        j                        }	|	dk(  r't        j                  | d|z  z   dz   dd	      }	|	dk(  r't        j                  | d|z  z   |	j                  t        j                        z         }
|
j                  |j                  d      }|r
 |||      }n|}d}|	d
k(  rd}n|dz
  }|dk\  r|r
 |||      }n|}|j                  t        j                  d      }t        j                  | d|z  z   d
z   |       t        j                          t        j
                  g d
t        j                        }t        j                  | d|z  z   dz   |d       |S )a  Compute exclusive scan of a scalar value between blocks

    Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back

    scratch_base: Pointer to scratch space in global memory
    block_value: Scalar value for this block, must be 64-bits wide
    index: Scalar index of this block relative to the current scan
    combine_fn: Function ``(value, value) -> value`` which is scanned over
    init: Scalar value equal to the identiy of combine_fn
    r   Tr      r   releaser   Facquirer   r   )r   r	   r   storedebug_barrierr   r   r0   rA   r   r   int32)r   r   rl   r   block_value_u64flag_oner   r   r   r   	value_u64rk   r   inclusive_prefix_u64flag_twos                  r   $exclusive_scan_decoupled_lookback_64r     s    qy%..D.A
E	)A-?
772q")),
|a%i/!3X9M xxK$5$56L!)K

wwr1bii(ai==K!?!!CQIVD ai GGL1{?:TWWRXX=NNO	[..=)%1AB$L19K%/K! 
& %&6D&+..ryy$.GHH\AI%)+?@wwr1bii(HNN<!e)+a/yIr   c                     t        j                  |       dz   }t        j                  | dk(  d|      }t        j                  | dk(  dt        j                  | |             }||fS )Nr   r   )r   ilogbr	   r7   ldexp)r3   yexponentmantissas       r   frexpr    s[     	QAxxQ1%HxxQ9??1qb#9:HXr   in_dimsstable
descendingc                    | j                   |z	  }|d|z  z  dd||z
  dz
  z  g}	t        j                  j                  | j                  j
                  d      }
t        j                  | |	      }|j                  |
d      }t        j                  dd      d d d d f   j                  |
      }d|z
  j                  |
      }t        j                  t        j                  ||z  d      j                  |
      d d d d d f   |	      }t        j                  t        j                  ||z  d      j                  |
      d d d d d f   |	      }t        j                  || j                        }t        j                  || j                        }|j                  | j                  d      }|j                  | j                  d      }t        j                  ||	      }t        j                  t        j                  ||j                  |j                        z  d      d d d d d f   |	      }t        j                  t        j                  ||j                  |j                        z  d      d d d d d f   |	      }t        j                  || j                        }t        j                  || j                        }|_t        j                  | j                  dt        j                        }t        j                  | j                  dt        j                        }n
||k  }||k  }| j                  |
d      }|r||k  }n||kD  }|r|||k(  ||kD  z  z  }||kD  ||k(  |z  z  }||z  j                  t        j                        }|t        j                  |||z  t        j                  |            z  }|t        j                  |||z  t        j                  |            z  }|j                  | j                  d      |fS )Nr   r   T)r   signedr   r   )numelr	   r   get_int_dtyperA   r   reshaper   arangebroadcast_torx   r   r   r1   r7   r   )r3   idxsrnumelflipr  r  r  r	  n_outerr   idtyper  iy
right_mask	left_maskileftirightleftr   y_idxleft_idx	right_idxleft_valid_maskright_valid_maskixr   retnew_idxss                               r   _compare_and_swap_with_indexr$    sT    GGv-G"QT>1aFQJN.CDEWW""AGG,F,Ft"TF


1eA	
fd	#B1aq$/226:JZ##F+IOOBFF2	>1588@D!LeTE__RVVBOQ7::6B1dA:NPUVFJJuagg&EZZ(F88AGGT8*DIIaggtI,E JJtU#E
uy||EKK00!4QaZ@%H 
uz}}U[[1115aqjA5I zz(AGG,H

9agg.I ~''!''4977177D"'':"V+$v- 
fd	#Be|e|(Y*>?@.	_	,4D 4KBGG$D
rxxefnbmmB.?@
@CbhhtX	%92==;NOOH66!''46((22r   stagealternatingc                    | j                   |z	  }t        j                  ||k         |re|d|dz
  |z
  z  z  dd|z  g}	t        j                  t        j                  t        j
                  dd      d d d d f   |	      | j                        }
nd}
t        j                  |      D ]  }t        | |||
|||z
  z   |||      \  } } | |fS )Nr   r   r   F)	r  r	   static_assertr  r  r  r   static_ranger$  )r3   r  r  r%  r&  r  r  r	  r  r   r  r  s               r   _bitonic_merge_with_indexr*  B  s     GGv-GUf_% &vzE/A)BBAq%xPzzOOBIIaOD!TM:EBAGG
 __U# 
.tVT1#7
4
 d7Nr   FrW   c                 p   t        j                  | |      \  } }|t        | j                        dz
  n|}t        j                  |t        | j                        dz
  k(  d       t        | j                  |         }t        j                  d|dz         D ]  }t        | |||||k  |||      \  } } | |fS )Nr   z+only minor dimension is currently supported)r&  r  r  r	  )r	   	broadcastlenr   r(  r   r)  r*  )	r3   r  r  rW   r  r	  _dimr  r  s	            r   sort_with_indexr/  c  s     ll1d#GAt-0[QWW)cDAGGq  "O !/F__Q
+ 

+F
!	
4

 d7Nr   c                    t         j                  j                  | j                  j                  d      }| j                  |d      }t        j                  ||z  ||      }|j                  | j                  d      S )NF)r  Tr   )	keep_dims)r	   r   r  rA   r   r   rx   )r3   rP   rW   r1  r  r!  r  s          r   
select_oner2    sd    WW""177#=#=e"LF	
fd	#B	T	3)	4B55$5''r   c                    t        j                          d}|j                  t         j                        }t        j                  d      j                  t         j                        }t        j
                  d      dk(  r	d||z
  z
  }n|}t        j                  | |d      }d}|s&t        j                  | dd      }||z  dz  dk7  }|s&t        j                          y)	z
    Wait for all other thread blocks in grid sharing same y/z program_id
    to reach this barrier before returning.

    Args:
        sem: an uint32 semaphores, zero or 0x80000000 initialized.  Must be unique to each y/z program ID.
    r   r   l        r   r   Fr   N)r	   r   r   uint32num_programs
program_idr   )r   one_i32one_u32expectednb
old_arrivebar_flippedcurrent_arrives           r   x_grid_barrierr>    s     Gjj#Gq!$$RYY/H	}}Q18g-.sBI6JKsA9="^3zAaG	  r   freturnc                     d| _         | S )a  
    Decorator to mark a function as a Triton built-in function.  These functions
    are evaluated at compile time.

    Args:
        f (function): The function to be marked as a Triton built-in.

    Returns:
        function: The same function, marked as a Triton built-in.
    T)__triton_builtin__)r?  s    r   triton_builtinrC    s      AHr   )_buildernrD  c                    t        | t        j                        sJ t        j                  t        j                  | j
                              S )z[
    A version triton.next_power_of_two that can be used within a kernel on constants.
    )r   r	   	constexprr
   next_power_of_2rk   )rE  rD  s     r   constexpr_next_power_of_2rI    s6     a&&&<<..qww788r   rP   c                |    t        | t        j                        r!| j                  t        j                  d      S |S )z
    Work around triton compile error: `ValueError: `other` cannot be provided without `mask``
    A compile-time to check to return either `val` or `None` depending on the value of mask.
    N)r   r	   rG  rk   )rP   valrD  s      r   if_maskrL    s/     $%$***<||D!!Jr   )F)@r   pymathr   typingr   r   triton_compatr   r   r	   r
   r   rG  log2er   __annotations__r   r#   r.   jitr4   r=   r?   rB   rF   rL   rQ   rT   rX   r[   re   rh   rm   rp   rt   r   r   r   r   r   r   r   r   r   tensorintrA   r   r   r   r   r   r   r  r$  r*  r/  r2  r>  rC  objectrI  rL  rD   r   r   <module>rW     s      = = T]%kfkk&((&;<",, <?$ ' '
 5 5 W W 4 4   4 4         & & & & N N N N > > > > ",,      R\\  . 
( 
(   ? ?  
     + + KIIKIIK K "%	K
 K iiK HHK K 		K K IIK K\ 
2 
2 	
2 
2 
4 
4   D
 D D DN 4 4n   A3
 
||A3 LLA3 LLA3 A3 A3H  <<	
  LL LL  @ 
 '2<<.+r||E2 
	
 LL  > ( (  @b R  +/9	||9"(9\\9 9 26 #  2<<  r   