
    Vh                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlm Z  d dl!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZPmQZQ ddlmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ ddl`maZbmcZcmdZdmeZe ddlfmgZg ddlhmiZi ddljmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZy dd lzm{Z{m|Z|m}Z}m~Z~mZmZ dd!lmZmZmZmZmZ dd"lmZ er&d d#lmZ d d$lmZ d d%lmZ dd&l7mZ dd'lmZ  ed(      Z ej$                  e      Zej*                  j-                  ed)      Zej*                  j-                  ed*      Zej*                  j-                  ed+      Z e:       Z9 G d, d-      Z ed      dWd.       Z ed      dWd/       Z G d0 d1      Zej<                   G d2 d3             Zej<                   G d4 d5             Z	 	 	 	 	 	 	 	 dXd6Z G d7 d8et      Z e       jF                  ZdYd9ZdYd:ZdZd;ZdYd<Zd[d=Zd\d>Z G d? d@eo      Zd]dAZd^d_dBZ G dC dDes      Zej_                  dE        G dF dGe      Z G dH dI      Zej<                   G dJ dK             Z G dL dM      Zej<                   G dN dO             Z G dP dQeneeeeeef   f   f         Z G dR dSee         Z G dT dUe      Zd`dVZy)a    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCache)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  @    e Zd ZU dZi Zded<   i Zded<   edd       Zy)	OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    |j                   }t        t        j                  t        j                  g      | j
                  |<   || j                  |<   y N)__name__r   torchfloat32float64rn   ro   )clsfuncconvert_outputop_names       N/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pyregister_upcastzOpDtypeSupport.register_upcast   s=    --(2EMM5==3Q(RW%'5G$    N)rw   zCallable[..., str]rx   boolreturnNone)	rr   
__module____qualname____doc__rn   __annotations__ro   classmethodr{    r|   rz   rm   rm   v   s1    
 <>8=')O_)6 6r|   rm   c                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    rz   gen_attr_descriptor_importr      s-     # v''):;Er|   c                     t               } | j                  d       t               x}r| j                  |       | j                  d       | j	                         S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rM   splicer   	writelinegetvalue)imports	attr_descs     rz   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r|   c                     e Zd ZdZ eej                  ej                  g      Z eej                  ej                  ej                  ge      ZeD  ci c]%  }|t        j                  t        |    ddd      ' c}}}} ZeD  ci c]3  }|t        j                  t        |   j#                          ddd      5 c}}}} Zed
d       Zed
d       Zy	c c}}}} w c c}}}} w )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    offsetTintegernonnegativeBLOCKr   positivec                4    | j                   |j                     S rq   )block_sizessymtrv   trees     rz   get_block_sizezTritonSymbols.get_block_size   s    tyy))r|   c                4    | j                   |j                     S rq   )block_offsetsr   r   s     rz   get_block_offsetzTritonSymbols.get_block_offset   s      ++r|   N)r   rW   r~   zsympy.Symbol)rr   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typessympySymbolr   r   upperr   r   r   r   ).0r   r   r   s   0000rz   r   r      s    !$--!?@Odkk4;;VoVWK    	ellj./v6RVWWM  	   	ell$%%'(.t
 	
K * * , ,#
s   *C
8C'
r   c                  z    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ddZddZddZddZddZe	dd       Z
y)IndexingOptionsstr	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strr}   _has_rindex
sympy.Exprindexc                ,    t        | j                        S rq   )r}   r   selfs    rz   has_maskzIndexingOptions.has_mask   s    DNN##r|   c                J    t        | j                  t        j                        S rq   )r   r   r   TMPr   s    rz   has_indirectzIndexingOptions.has_indirect   s    "4::txx88r|   c                    | j                   S rq   )r   r   s    rz   
has_rindexzIndexingOptions.has_rindex   s    r|   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)tmpNr   
startswithr   masks     rz   	<genexpr>z.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s     J43t9''.J   $&anyr   r   s    rz   has_tmpmaskzIndexingOptions.has_tmpmask   s    J4>>JJJr|   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)rNr   r   s     rz   r   z,IndexingOptions.has_rmask.<locals>.<genexpr>   s     H3t9'',Hr   r   r   s    rz   	has_rmaskzIndexingOptions.has_rmask   s    HHHHr|   c                p    | j                   r)dj                  t        t        | j                               S dS )N & r   )r   joinmapr   r   s    rz   mask_strzIndexingOptions.mask_str   s'    7;~~uzz#c4>>23Q6Qr|   Nr~   r}   r~   r   )rr   r   r   r   r   r   r   r   r   propertyr   r   r|   rz   r   r      sP    N$9 KI R Rr|   r   c                  R   e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   d	ed<   dZded<   ed!d       Zed!d       Zed!d       Zed!d       Z		 	 	 	 	 	 	 	 	 	 d"dZ
e	 	 	 	 	 	 	 	 	 	 	 	 d#d       Z	 	 	 	 	 	 	 	 d$dZd%d&dZd'dZd(dZd)dZd*dZd*dZd*dZd*dZd*d Zy)+BlockPtrOptionsBlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkc                .    | j                   j                  S rq   )r   shaper   s    rz   r   zBlockPtrOptions.shape   s    {{   r|   c                .    | j                   j                  S rq   )r   block_shaper   s    rz   r   zBlockPtrOptions.block_shape   s    {{&&&r|   c                .    | j                   j                  S rq   )r   stridesr   s    rz   r   zBlockPtrOptions.strides       {{"""r|   c                .    | j                   j                  S rq   )r   offsetsr   s    rz   r   zBlockPtrOptions.offsets   r   r|   c                0  	 t        | j                  | j                        D cg c]#  \  }}|rt        j                  j
                  n|% }}}t        |||      }t        j                  j                  	t        | j                        xr7 t        |      t        |      k7  xs t        	fdt        ||      D              }|r|r0d| dt        j                  j                  | j                         d}t        || j                  |      }|S c c}}w )z
        Generate a broadcast and a reshape for the block pointer.
        This restores stride-0 dimensions which were removed from the block pointer.
        c              3  r   K   | ].  \  }}j                  |d       xs j                  ||        0 ywrE   N)statically_known_equals)r   pre_dimpost_dimsizevarss      rz   r   z@BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>  sG      
 &GX 44Wa@ K77Js   47tl.broadcast_to(, ))zipr   r   r   SOnetriton_reshaperC   graphr   r   lenkernelindex_to_str)
r   valueinitial_shaper   allow_implicitdimis_broadcastingpre_broadcast_shaperequire_broadcastr   s
            @rz   codegen_broadcast_and_reshapez-BlockPtrOptions.codegen_broadcast_and_reshape  s    ),$$d&<&<)
$_ +EGGKK3
 
 um5HI 77## 6 67 	
#$K(88  
 *--@+)N  	 !2&ugR0E0EdFZFZ0[/\\]^E ud&:&:KH=
s   (Dc                   t         j                  j                  d	fd} || j                        | _         || j                        | _        | j                  D cg c]  }j                  |d       }}| j                  D cg c]  }j                  |d       }	}t        |	      rd|	d<   t        | j                  |	      D 
cg c]	  \  }}
|
s| }}}
t        |	|      D cg c]  }t        |       c}fd}t        d
i t        j                  |       j                         D ci c]  \  }}| ||       c}}} |D cg c]  }t        j                  |       }}t         j                   j"                  r%|d   j$                  dk(  sJ |j'                  d       t         j                   j(                  }t         j                   j*                  st-        | j                        t-        t         j                   j.                        |z
  k(  rIt         j                   j0                  j3                         r!|t4        j6                  j8                  g|z  z  }t;        | t         j                  j                  j=                  |      t?        tA        tC        t-        | j                                          ||||      }|jE                  |       |S c c}w c c}w c c}
}w c c}w c c}}w c c}w )z,Helper to create a  BlockPtrOptions instancec                L    | D cg c]  }j                  |       c}S c c}w rq   )lookup_precomputed_size)exprsexprr   s     rz   lookup_sizez+BlockPtrOptions.create.<locals>.lookup_size=  s"    GLMtH44T:MMMs   !r   rE   Fc                R    t        |       D cg c]	  \  }}|s| c}}S c c}}w )z@Removes any broadcasting or singleton dims from a given sequence)r   )ititemis_removableremovable_dimss      rz   remove_dimsz+BlockPtrOptions.create.<locals>.remove_dims_  s3     +.b.*A&D,#   s   #x)r   r   r   r   r   r   r   )r  zIterable[sympy.Expr]r~   list[sympy.Expr]r   )#rC   r  r   r   r   r   r   allr   r   r   dataclassesasdictitemsr   r   r  no_x_dimprefixpopnum_reduction_dimsinside_reductionr  numelsfeaturesis_reductionr   r   r   r   r  listreversedrangecompute_boundary_check)r   r   range_treesr   get_max_blockr  strider   r  singleton_dimsis_singletonr   dimsr  keyvalr   r   reduction_ndimresultr  r   s                       @@rz   createzBlockPtrOptions.create0  s    77##	N #6<<0$V^^4
 GMnn
<BH,,VQ7
 
 AG@R@R
9<H,,S!4
 
 ~!&N2 &)););^%L
!\ 
 
 14NDU0VW#d)W	 ! 
5@5G5G5O5U5U5WXcsK$$X

 GRRd}33D9RR88q>((C///OOA44))FNN#s188??';n'LL!!..0 EGGKK=>99K GG,,DD_Uxc&,,&7 89:#+/
 	%%m4


 X Y Ss$   KK$K)+K/2K4K:c                D    t         j                  |   }t        |||i      S )zN
        Replaces instances of {symt}_offset with the new expression.
        )r   r   r<   )r   r  replacementr   roffsets        rz   replace_offsetzBlockPtrOptions.replace_offset  s&      --d3$+ 677r|   c           	         d fd}t         j                  j                  }g  j                  }|s|D cg c]
  } ||       }} j                  dk7  r| d | j                         dn|d | j
                         d | j                         d | j                         d | j                         d	 ||       g}d
dj                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        c                ~    t         j                  D ](  }j                  | t        j                  d      |      } * | S Nr   )r   r   r:  r   Integer)r  r   r   s     rz   remove_roffsetsz/BlockPtrOptions.format.<locals>.remove_roffsets  s<    %55 I**4q1A4HIKr|   r    + (r   zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r   )r  r   r~   r   )
rC   r  r  r   r   r   r   r   r   r   )r   namer9  r?  fr   r   argss   `       rz   formatzBlockPtrOptions.format  s    	
 HH!!!DLL/=DE6v.EGE ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Fs   C c           
        t         j                  j                  }t        j                  j                         D ci c]  \  }}| |t        |          }}}t        t        | j                              D cg c]  }|j                  | j                  |   t        j                  j                        s|j                  | j                  |   | j                   |         s|j                  | j                  |   t#        | j                   |   |            sMt         j$                  j&                  r1| j                   |   t        j                  t(        j*                     k(  s| c}| _        yc c}}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)N)rC   r  r   r   r   r  r   r*  r  r   r   r   r   r   Zerostatically_known_multiple_ofr   r<   r  r   r   r   r   )r   r-  r   r   
block_sizeblock_to_maxidxs          rz   r+  z&BlockPtrOptions.compute_boundary_check  s-   77## %2$=$=$C$C$E/
 j j&677/
 /
 S_- 
44T\\#5FU ==JJsOT%5%5c%: !==JJsOZ0@0@0E|%T HH%%((-1J1J4;;1WW  
/

 
s   E/9C-E5c                6    | j                   J | j                   S rq   )r   r   s    rz   boundary_checkzBlockPtrOptions.boundary_check  s     ##///###r|   c           	         t         j                  |   }| j                  D cg c]A  }| j                  |||      | j                  |t        j
                  j                  |      z
  C }}|S c c}w )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        )r   r   r   r:  r   r   rF  )r   r   rblockr   advances        rz   advance_roffsetzBlockPtrOptions.advance_roffset  st     **40 ,,

  ##FFD9%%feggllDAB
 
 
s   AA,c                     yNFr   r   s    rz   r   zBlockPtrOptions.has_indirect      r|   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wrq   )r   r   r   )r   r  s     rz   r   z-BlockPtrOptions.has_rindex.<locals>.<genexpr>  s%      
  m&C&CD
r   )r   r   r   s    rz   r   zBlockPtrOptions.has_rindex  s"     
((
 
 	
r|   c                "    | j                         S rq   )r   r   s    rz   r   zBlockPtrOptions.has_rmask  s      r|   c                     yrR  r   r   s    rz   r   zBlockPtrOptions.has_tmpmask  rS  r|   c                4    t        | j                               S rq   )r}   rL  r   s    rz   r   zBlockPtrOptions.has_mask  s    D'')**r|   r~   r  )
r  r   r  r   r   r   r  r}   r~   r   )r   r   r   r   r,  zlist[IterationRangesRoot]r   r   r-  Callable[[str], int]r~   r   )r  r   r8  r   r   r   r~   r   T)rA  r   r~   r   )r-  rZ  r~   r   )r~   r   )r   r   r~   r   r   )rr   r   r   r   r   r   r   r   r   r   r  staticmethodr6  r:  rD  r+  rL  rP  r   r   r   r   r   r   r|   rz   r   r      se   ))!!%%+/O(/! ! ' ' # # # #++ ,+ *	+
 + 
+Z TT $T /	T
 #T ,T 
T Tl88-78?C8	8!7F
:$&
!+r|   r   c                r   t        |t              rt        |t              sJ |D cg c]!  }t        j                  j	                  |      # }}|D cg c]!  }t        j                  j	                  |      # }}||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }	|t        |      k  r|	||   k(  r|j                  d       |dz  }0|	dk(  sJ |j                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w c c}w c c}w )z7Workaround https://github.com/openai/triton/issues/28361ztl.reshape(z, [r   z])r   :rE   r   [])
isinstancer(  rC   r  r  r   r  append)
r  	old_shape	new_shaper   old_shape_strnew_shape_strsrJ  expandsizes
             rz   r   r     sE    i&:i+FFF?HIeQXX**51IMI?HIeQXX**51IMI% -aAH->UG3tyy'?&@CC
CF "]##c0B(BMM#1HC3;;MM&!" #m$$$$WAdii'(**% JI .s   &D*&D/
D4D4c                      e Zd ZddZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZeZddZddZddZddZddZddZddZddZddZddZddZddZddZddZddZddZy) TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrE   libdevice.trunc(r   ).to(r   r  rC  _printrC   r  index_dtyper   r  s     rz   _print_TruncToIntzTritonPrinter._print_TruncToInt  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r|   c                x    t        j                         rt        j                  j                  r| }|S d| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcoders   versionhip)r   r  rets      rz   _print_FloatzTritonPrinter._print_Float   s=    %--"3"3FC 
 !m4C
r|   c                    t        |j                        dk(  sJ | j                  |j                  d   t        d   dz
        }| dS )NrE   r   Atom      ?z.to(tl.float64))r  rC  parenthesizer   )r   r  rh  s      rz   _print_ToFloatzTritonPrinter._print_ToFloat'  sI    499~"""diilJv,>,DEO$$r|   c                    |j                   \  }}|j                  r3|j                  r'| j                  |j                   dt        d   dz
        S | j	                  |      }| j	                  |      }d| d| dS )N % r}  r~  z!triton_helpers.remainder_integer(r   r   )rC  is_nonnegative	stringifyr   rq  r   r  quotdivquot_sdiv_ss         rz   _print_PythonModzTritonPrinter._print_PythonMod,  sr    II	c3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr|   c                   |j                   sJ |j                  \  }}|j                  r3|j                  r'| j                  |j                  dt        d   dz
        S | j                  |      }| j                  |      }d| d| dS )N // r}  r~  z!triton_helpers.div_floor_integer(z,  r   )
is_integerrC  r  r  r   rq  r  s         rz   _print_FloorDivzTritonPrinter._print_FloorDiv4  s~    II	c3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr|   c                P    | j                  |j                  dt        d   dz
        S )N / r}  r~  )r  rC  r   rs  s     rz   _print_IntTrueDivzTritonPrinter._print_IntTrueDiv?  s#    ~~dii
60BS0HIIr|   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrE   libdevice.floor(r   ro  r   rp  rs  s     rz   _print_floorzTritonPrinter._print_floorD  ru  r|   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  rp  rs  s     rz   _print_FloorToIntzTritonPrinter._print_FloorToIntJ  ru  r|   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrE   libdevice.ceil(r   ro  r   rp  rs  s     rz   _print_ceilingzTritonPrinter._print_ceilingP  K    499~""" TYYq\!: ;5AUAU@VVWXXr|   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  rp  rs  s     rz   _print_CeilToIntzTritonPrinter._print_CeilToIntT  r  r|   c                ,    d| j                  |       dS )Nzlibdevice.sqrt(().to(tl.float32)))rq  rs  s     rz   _helper_sqrtzTritonPrinter._helper_sqrtX  s    !$++d"3!44EFFr|   c                    d| j                  |j                  d          d| j                  |j                  d          dS )Nlibdevice.pow(r   r   rE   r   )rq  rC  rs  s     rz   _print_FloatPowzTritonPrinter._print_FloatPow[  s?    T[[167r$++diiPQl:S9TTUV	
r|   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   rE   r   	tl.where(r   r   )doprintrC  )r   r  cpqs        rz   _print_WherezTritonPrinter._print_Whereb  s_    LL1&LL1&LL1&1#Rs"QCq))r|   c                   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zK
        Helper for max/min code genereration.
        cmp: > or <
        rE   r   r   Nc              3  (   K   | ]
  }d | d  yw)(r   Nr   r   r  s     rz   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>w  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + )))r  rC  rq  typetuple)r   r  cmpmidrv   abs          rz   _print_min_max_helperz#TritonPrinter._print_min_max_helperh  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr|   c                &    | j                  |d      S )Nr  r  rs  s     rz   
_print_MinzTritonPrinter._print_Min{      ))$44r|   c                &    | j                  |d      S )Nr  r  rs  s     rz   
_print_MaxzTritonPrinter._print_Max~  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   tl_math.abs(r   r   r  rC  rq  rs  s     rz   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.cos((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.cosh((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.acos((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.sin((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.sinh((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.asin((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.tan((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.tanh((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r|   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrE   zlibdevice.atan((r   r  r  rs  s     rz   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r  r|   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrE   zlibdevice.llrint(r   ro  r   rp  rs  s     rz   _print_RoundToIntzTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r|   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r  rC  r  
ValueErrorr  r   )r   r  numberndigits
number_strs        rz   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr|   N)r  r   r~   r   )r  r   r  r   r~   r   ) rr   r   r   rt  r{  r  r  r  r  r  r  r  r  r  r  _print_PowByNaturalr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r|   rz   rl  rl    s    
%
FGJ


YYG

 **C&55;NOONOONOO
Qr|   rl  c                *    t        t        |             S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r=   r?   dtypes    rz   triton_compute_typer    s    *5122r|   c                ^    | t         j                  k(  rt         j                  } t        |       S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rs   r}   int8r=   r  s    rz   triton_store_typer    s"    



ur|   c                    t        |       r+| j                  r| j                  dk  rt        j                  S t        |       S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizers   int32r?   r  s    rz   upcast_acc_dtyper    s0    5??u~~7J{{u%%r|   c                *    t        t        |             S )z:Convert torch.dtype to triton type, with reduction upcasts)r  r  r  s    rz   triton_acc_typer    s    /677r|   c                <    | j                   dk  xr | j                  S )Nr   )r  is_floating_pointr  s    rz   low_precision_fpr    s    >>Q:5#:#::r|   c                    t        | t              sy| j                  }t        |t        j                        rt	        |      S dS rR  )rb  rK   r  rs   r  )varr  s     rz   low_precision_fp_varr    s6    c;'IIE&0&DE"O%Or|   c                  &     e Zd Zd fdZd Z xZS )TritonCSEVariablec                f    t         |   |||       t        t                  | _        |J d       y )Nz!TritonCSEVariable must have dtype)super__init__r   r   r   )r   rA  boundsr  	__class__s       rz   r  zTritonCSEVariable.__init__  s4    vu-#C* E"EE r|   c                F   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sTt        j                  D ]6  }t        ||      s| j                  j                  t        |    dg          y )Nr   )
rb  r  r   updater   r   r   r   r   r   )r   rA  rC  kwargsargr   s         rz   update_on_argsz TritonCSEVariable.update_on_args  s     
	C#01%%cmm4C. *55 D%c40--*T2B1C4/H.IJ
	r|   )r   zValueRanges[Any]r  torch.dtyper~   r   )rr   r   r   r  r  __classcell__r  s   @rz   r  r    s    Fr|   r  c                     ddl m}   |        S )Nr   rd   )!torch._inductor.dtype_propagationre   rd   s    rz   get_dtype_handlerr    s    L%''r|   c                0     dddfdd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    c                    t         j                  j                   xr> t        | t              xr, | j
                  t        j                  t        j                  fv S rq   )	r   r   codegen_upcast_to_fp32rb  rK   r  rs   float16bfloat16)r  s    rz   needs_upcastz*maybe_upcast_float32.<locals>.needs_upcast  sD    444 =3,=		emmU^^<<	
r|   c                (     |       rdnd}|  | S )N.to(tl.float32)r   r   )r  upcast_stringr  s     rz   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_arg  s!    -9#->)B}o&&r|   c                H     t         j                          d fd}|S )Nc                    | D cg c]
  } |       }}|j                         D ci c]  \  }}| |       }}} |i |}xr6 t        fdt        j                  | |j	                               D              }|sd n# t        t               j                        | i |}	|	t        j                  d fv}
|
r|	dt        |	       dnd}| | S c c}w c c}}w )Nc              3  .   K   | ]  } |        y wrq   r   )r   r  r  s     rz   r   zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>  s      6&)S!6s   .to(r   r   )r  r   	itertoolschainvaluesgetattrr  rr   rs   rt   r=   )rC  r  r  upcast_argsr2  r3  upcast_kwargsr5  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringrx   rw   r  r  s               rz   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped  s   <@AS+C0AKAHNWHCS"23"77WMW ;8-8F-  # 6-6__T6==?-S6 3
 ( @W.0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   CCr   )rm   r{   )rw   r%  rx   r  r  s   ` rz   	decoratorz'maybe_upcast_float32.<locals>.decorator  s$    &&t^<	0 	0. r|   r   r   )rw   Callable[..., Any]r~   r'  r   )rx   r&  r  r  s   ` @@rz   maybe_upcast_float32r(    s    
': r|   c                     e Zd ZdZ ej
                  ej                        Ze	 	 dT	 	 	 dUd       Z	edVd       Z
ed        Zed        Ze e       d               Zed	        Zed
        Ze e       d               Ze e       d               Ze e       d               Ze e       d               Ze e       d               Ze e       d               Ze e       d               Zed        Zed        Zed        Zed        Zedej<                  dddd       Ze e       d               Z e e       d               Z!e e       d               Z"e e       d               Z#ed        Z$ed        Z%e e       d               Z&e e       d                Z'e e       d!               Z(e e       d"               Z)e e       d#               Z*e e       d$               Z+e e       d%               Z,e e       d&               Z-e e       d'               Z.e e       d(               Z/e e       d)               Z0e e       d*               Z1e e       d+               Z2e e       d,               Z3e e       d-               Z4e e       d.               Z5e e       d/               Ze e       d0               Z6ed1        Z7ed2        Z8ed3        Z9ed4        Z:ed5        Z;ed6        Z<ed7        Z=ed8        Z>ed9        Z?ed:        Z@ed;        ZAed<        ZBed=        ZCed>        ZDe e       d?               ZEe e       d@               ZFe e       dA               ZGe e       dB               ZHe e       dC               ZIedD        ZJe e       dE               ZKe e       dF               ZLe e       dG               ZMe e       dH               ZNe edIJ      dK               ZOe edIJ      dL               ZPe e       dM               ZQe e       dN               ZRedO        ZSedP        ZTe e       dQ               ZUedR        ZVe e       dS               ZWy)WTritonOverrideszMap element-wise ops to TritonNTc                :   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r|  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r   )rs   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      rz   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread<  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr|   r  z != 0)z.to(tl.int8).to(tl.uint8)r  r   )r/  r  r0  r  r~   int)	maxrC   r  min_elem_per_threadrs   r}   uint8r  r  )r  r  r/  use_compute_typesr2  	out_dtypes         rz   to_dtypezTritonOverrides.to_dtype5  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122+E2I)%0ID1%%r|   c                    |j                   |j                   k(  sJ | j                  |k7  r|  dt        |       d} |  dt        |       d}t        |      |k7  r| dt        t        |             d}|S )Nr  r   z, bitcast=True))r  r  r=   r?   )r  r  r/  outs       rz   to_dtype_bitcastz TritonOverrides.to_dtype_bitcastn  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r|   c                    t         j                  j                  |      }t         ||             }t	        |      }|dk(  r|S d| d| d| dS )Nz
tl.float32tl.full(r   r   )rs   _prims_commondtype_to_typerV   r  )r  r  r   type_
triton_valr=   s         rz   _shaped_constantz TritonOverrides._shaped_constant  s[    ##11%8"5<0
)%0,& %:,bQ??r|   c                *    | j                  ||g       S )Nr   )rC  )rv   r  r  s      rz   constantzTritonOverrides.constant  s    ##E5#;;r|   c                    d|  dS )Nr  r   r   r  s    rz   abszTritonOverrides.abs       aS""r|   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r   r  )r  r  truedivrs   r  rt   r=   r  yr;  r8  s       rz   rL  zTritonOverrides.truediv  sl    !Cs!n"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r|   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r   r  )r  r  modrs   r  rt   r=   rM  s       rz   rP  zTritonOverrides.mod  sl    !Cs!n"&:1&=)+//15IU]]EMM::T+i"8!9;
r|   c                    d|  dS )Nzlibdevice.abs(r   r   rH  s    rz   libdevice_abszTritonOverrides.libdevice_abs        s!$$r|   c                \    t         j                  rd|  dt        j                   dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        libdevice.exp2(r  r   ztl_math.exp()r   use_fast_mathr*  _LOG_2_ErH  s    rz   expzTritonOverrides.exp  s8     $QCs?+C+C*DAFF!!A&&r|   c                    d|  dS )Nzlibdevice.exp(r   r   rH  s    rz   libdevice_expzTritonOverrides.libdevice_exp  rS  r|   c                    d|  dS )NrU  r   r   rH  s    rz   exp2zTritonOverrides.exp2       !1%%r|   c                    d|  dS )Nzlibdevice.expm1(r   r   rH  s    rz   expm1zTritonOverrides.expm1       "!A&&r|   c                    d|  dS Nzlibdevice.sqrt(r   r   rH  s    rz   sqrtzTritonOverrides.sqrt  r]  r|   c                    d|  dS rb  r   rH  s    rz   libdevice_sqrtzTritonOverrides.libdevice_sqrt  r]  r|   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumrF  rs   r  AssertionError)r  bugs     rz   reluzTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r|   c                    d|  d| dS )Nztriton_helpers.minimum(r   r   r   r  r  s     rz   minimumzTritonOverrides.minimum      (2aS22r|   c                    d|  d| dS )Nztriton_helpers.maximum(r   r   r   rq  s     rz   rl  zTritonOverrides.maximum  rs  r|   c                    d|  d| d| dS )Nr  r   r   r   )r  r  r  s      rz   wherezTritonOverrides.where  s    1#Rs"QCq))r|   rE   )constraintsr  is_purepackc                    t        |      }dj                  |D cg c]  }t        |       c}      }|#dj                  dg|D 	cg c]  }	d c}	z         }d|  d| d| d| d| d	| d
S c c}w c c}	w )Nr   z=rr   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r   )r  r   r   )
asmrw  r  rx  ry  inputsr=   i
input_refs_s
             rz   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-Bac-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A.	A3
c                    d|  dS )Nztl_math.cos(r   r   rH  s    rz   coszTritonOverrides.cos  rJ  r|   c                    d|  dS )Nzlibdevice.cos(r   r   rH  s    rz   libdevice_coszTritonOverrides.libdevice_cos  rS  r|   c                    d|  dS )Nztl_math.sin(r   r   rH  s    rz   sinzTritonOverrides.sin  rJ  r|   c                    d|  dS )Nzlibdevice.sin(r   r   rH  s    rz   libdevice_sinzTritonOverrides.libdevice_sin  rS  r|   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)rv   r  r  s      rz   
index_exprzTritonOverrides.index_expr  s    !"STTr|   c                    t        d      )Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      rz   maskedzTritonOverrides.masked  s    !"OPPr|   c                    d|  dS )Nzlibdevice.lgamma(r   r   rH  s    rz   lgammazTritonOverrides.lgamma       #1#Q''r|   c                    d|  dS )Nzlibdevice.erf(r   r   rH  s    rz   erfzTritonOverrides.erf"  rS  r|   c                    d|  dS )Nzlibdevice.cosh(r   r   rH  s    rz   coshzTritonOverrides.cosh'  r]  r|   c                    d|  dS )Nzlibdevice.sinh(r   r   rH  s    rz   sinhzTritonOverrides.sinh,  r]  r|   c                    d|  dS )Nzlibdevice.acos(r   r   rH  s    rz   acoszTritonOverrides.acos1  r]  r|   c                    d|  dS )Nzlibdevice.acosh(r   r   rH  s    rz   acoshzTritonOverrides.acosh6  r`  r|   c                    d|  dS )Nzlibdevice.asin(r   r   rH  s    rz   asinzTritonOverrides.asin;  r]  r|   c                    d|  dS )Nzlibdevice.asinh(r   r   rH  s    rz   asinhzTritonOverrides.asinh@  r`  r|   c                    d|  d| dS )Nzlibdevice.atan2(r   r   r   r  rN  s     rz   atan2zTritonOverrides.atan2E       "!Bqc++r|   c                    d|  dS )Nzlibdevice.atan(r   r   rH  s    rz   atanzTritonOverrides.atanJ  r]  r|   c                    d|  dS )Nzlibdevice.atanh(r   r   rH  s    rz   atanhzTritonOverrides.atanhO  r`  r|   c                    d|  d| dS )Nzlibdevice.copysign(r   r   r   r  s     rz   copysignzTritonOverrides.copysignT  s     %QCr!A..r|   c                    d|  dS )Nzlibdevice.erfc(r   r   rH  s    rz   erfczTritonOverrides.erfcY  r]  r|   c                    d|  dS )Nzlibdevice.erfinv(r   r   rH  s    rz   erfinvzTritonOverrides.erfinv^  r  r|   c                    d|  d| dS )Nzlibdevice.hypot(r   r   r   r  s     rz   hypotzTritonOverrides.hypotc  r  r|   c                    d|  dS )Nzlibdevice.log10(r   r   rH  s    rz   log10zTritonOverrides.log10h  r`  r|   c                    d|  dS )Nzlibdevice.log2(r   r   rH  s    rz   log2zTritonOverrides.log2m  r]  r|   c                    d|  d| dS )Nzlibdevice.nextafter(r   r   r   r  s     rz   	nextafterzTritonOverrides.nextafterr  s     &aS1#Q//r|   c                    |  d| S Nr   r   rq  s     rz   logical_andzTritonOverrides.logical_andw      Cs|r|   c                    |  dS )Nz == 0r   r  s    rz   logical_notzTritonOverrides.logical_not{  s    E{r|   c                    |  d| S Nz | r   rq  s     rz   
logical_orzTritonOverrides.logical_or  r  r|   c                    d|  d| dS )Nr   ^ r   r   rq  s     rz   logical_xorzTritonOverrides.logical_xor  s    1#S1~r|   c                    |  d| S r  r   rq  s     rz   bitwise_andzTritonOverrides.bitwise_and  r  r|   c                    d|  S )N~r   r  s    rz   bitwise_notzTritonOverrides.bitwise_not  s    1#wr|   c                    |  d| S r  r   rq  s     rz   
bitwise_orzTritonOverrides.bitwise_or  r  r|   c                    |  d| S )Nr  r   rq  s     rz   bitwise_xorzTritonOverrides.bitwise_xor  r  r|   c                    |  d| S )Nz << r   rq  s     rz   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      D}r|   c                    |  d| S )Nz >> r   rq  s     rz   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  r  r|   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r   r   r   seedr   s     rz   randzTritonOverrides.rand  s%    VHO,$r&++r|   c                     d| d}d|  d| dS )Nr  r  z	tl.randn(r   r   r   r  s     rz   randnzTritonOverrides.randn  s%    VHO,4&6(!,,r|   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(r   r   r   )r  r   lowhighs       rz   	randint64zTritonOverrides.randint64  s1    VHO,*4&6("SED6KKr|   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr  )rA  r   s     rz   	load_seedzTritonOverrides.load_seed  s    !"RSSr|   c                    d|  dS )Nzlibdevice.rsqrt(r   r   rH  s    rz   rsqrtzTritonOverrides.rsqrt  r`  r|   c                    d|  dS )Nzlibdevice.log1p(r   r   rH  s    rz   log1pzTritonOverrides.log1p  r`  r|   c                    d|  dS )Nzlibdevice.tan(r   r   rH  s    rz   tanzTritonOverrides.tan  rS  r|   c                    d|  dS )Nzlibdevice.tanh(r   r   rH  s    rz   tanhzTritonOverrides.tanh  r]  r|   c                    d|  dS )Nztl.sigmoid(r   r   rH  s    rz   sigmoidzTritonOverrides.sigmoid  s     QCq!!r|   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   rH  s    rz   signbitzTritonOverrides.signbit  s#     "!L3NqcQUV	
r|   c                    d|  d| dS )Nzlibdevice.fmod(r   r   r   rq  s     rz   fmodzTritonOverrides.fmod  s     !2aS**r|   c                    d|  d| dS )Nr  r   r   r   rq  s     rz   powzTritonOverrides.pow  s      s"QCq))r|   c                    d|  dS )Nztl_math.log(r   r   rH  s    rz   logzTritonOverrides.log  rJ  r|   c                    d|  dS )Nzlibdevice.log(r   r   rH  s    rz   libdevice_logzTritonOverrides.libdevice_log  rS  r|   F)rx   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   rH  s    rz   isinfzTritonOverrides.isinf       "!M22r|   c                    d|  dS )Nzlibdevice.isnan(r  r   rH  s    rz   isnanzTritonOverrides.isnan  r  r|   c                    d|  dS )Nzlibdevice.nearbyint(r   r   rH  s    rz   roundzTritonOverrides.round  s     &aS**r|   c                    d|  dS )Nr  r   r   rH  s    rz   floorzTritonOverrides.floor  r`  r|   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   r   )r  r  r  rems       rz   floordivzTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr|   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r  .dtype))rk  rF  rs   r  r9  ltr  sub)r  zleftrightr  s        rz   signzTritonOverrides.sign  su    LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r|   c                    d|  dS )Nrn  r   r   rH  s    rz   trunczTritonOverrides.trunc  r`  r|   c                    |  d| S )Nr  r   rq  s     rz   truncdivzTritonOverrides.truncdiv  s     D}r|   c                    d|  dS )Nr  r   r   rH  s    rz   ceilzTritonOverrides.ceil  r]  r|   )NT)r  r  r/  zOptional[torch.dtype])r  r  r/  r  )Xrr   r   r   r   mathr  erW  r\  r9  r<  rC  r   rF  r(  rI  rL  rP  rR  rX  rZ  r\  r_  rc  re  ro  rr  rl  rv  rs   rt   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r   r|   rz   r*  r*  0  s1   (tyy H ,0	6&6& )6& 6&p    @ @ < < #  #     %  % '  ' %  % &  & '  ' &  & &  &  " 3 3 3 3 * * "&emmTPQK K #  # %  % #  # %  % U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % &  & "  " 
 
 +  + *  * #  # %  % /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r|   r*  r   c                  `    e Zd ZdZed        Zed        Zed        Zed        Z	ed        Z
y)TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                r    t         j                  j                         }dg|z  }| j                  |||      S )NrE   rE  )rC   r  triton_tensor_ndimrC  )rv   r  r  ndimr   s        rz   rF  zTritonKernelOverrides.constant'  s9    
 xx**,d
##E5#>>r|   c                n   t         j                  j                  |d      }t        |t              sJ t         j                  j
                  dk(  rt        j                  nt        j                  }|t        j                  t        j                  fvr|n|}t        j                  j                  }	 dt        j                  _        t         j                  j                  j                  t         j                  j                  |j                  t!        |      |      }|t        j                  _        |t        j                  t        j                  fvr^t         j                  j                  j                  t         j                  j                  | j#                  ||      t%        |            }n|}|j&                  D ]l  }t)        |t*        j,                        st        j.                  |t         j                  j                  j0                  |j2                     j4                        }n ||k7  rTt         j                  j                  j                  t         j                  j                  | j#                  ||      |      }|j6                  |_        |S # |t        j                  _        w xY w)NF	block_ptrtl.int32r   r  r  )rC   r  indexingrb  r   rr  rs   r  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r4   r9  r?   free_symbolsr   r   r   promote_typesvarname_maprA  r  r   )rv   r  r  r  rr  origr  	index_vars           rz   r  z TritonKernelOverrides.index_expr0  s   88$$TU$;(O444 &'XX%9%9Z%GekkU[[u{{EKK&@@k "">>		C>CF;((,,''  "",T2	 ( C ?CF;ekk22((,,''  S%()%0 ( C  E!.. 	!)TXX6!//qxx||77	GMME #hhll++HH$$LLk2% ,  !**
9 ?CF;s   2A-J J4c           
        | ot         j                  j                  Ut        j                  j
                  j                  t        j                  j                  |  dt         j                        } |j                  j                  d      }|sJ d       d}|D ]>  }|j                  D ]-  }|j                  dk7  st        |j                  d         s+d	} > @ |rd n|}t        j                  j                  | |
      5 } |       }	d d d        |r	j                  j                   rt        |      }t        j                  j
                  j                  t        j                  j                  d|	 dt#        |       d|	 dt%        j&                  |      |	j(                        }t+        j,                  |	|      }
n	}
|
j.                  j1                         |
S # 1 sw Y   xY w)N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrE   Tr  r>  z.shape, r   r  r  )rs   rx  ry  rC   r  r   r!  r"  r}   r  
find_nodesrC  targetr_   
mask_loadsr   is_boolrV   r   wrapr  rk  rv  r   discard)r   r  r  nodes
need_wherenoder  r  new_maskr5  rz  s              rz   r  zTritonKernelOverrides.maskedc  s    1 1 =88<<((  &%jj ) D 

%%%2AAAu
  	Dyy ::'+CCHHQK+P!%J	 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''.ll	 * E ))Hfe4CCh'
'	 	s   G""G+c                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load( + load_seed_offsetr   )rC   r  rC  inputseed_offset)rA  r   r  s      rz   r  zTritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r|   c                   d|  d}t         j                  j                  j                  |      x}r|S t         j                  j                  j	                  | j
                        }t         j                  j                  j	                  t        j                        }t         j                  j                  j                  | d| d|  d       t         j                  j                  j                  |||f       ||fS )Nzfrexp(r   r  r   z = triton_helpers.frexp()rC   r  r   try_getnewvarr  rs   r  r"  r   put)r  	cache_keycse_valmantissaexponents        rz   frexpzTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWW&588<<&&U[[&9	""j8*$<QCqA	
 	
Xx$89(##r|   N)rr   r   r   r   r   rF  r  r\  r  r  rF  r   r|   rz   r  r    sm     ? ? 0 0d * *X 
 
 $ $r|   r  c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     i | _         g | _        y rq   )rI  rK  r   s    rz   r  zHelperFunctions.__init__  s    !!#r|   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )rA  )rI  getr  rK  rc  rD  )r   template_coderO  existing_namerA  s        rz   addzHelperFunctions.add  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr|   c                ,    t        | j                        S rq   )iterrK  r   s    rz   __iter__zHelperFunctions.__iter__  s    D**++r|   c                     | j                   |   S rq   )rK  )r   rJ  s     rz   __getitem__zHelperFunctions.__getitem__  s    %%c**r|   Nr~   r   )rR  r   r~   r   )	rr   r   r   r   r   r  rT  rW  rY  r   r|   rz   rH  rH    s+    -##  $ 4G ,,+r|   rH  c                      e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   d
dZy	)r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr  r   r   r   r   c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wrq   )r  r  r  s     rz   r   z*BlockParameters.__add__.<locals>.<genexpr>  s     Bq[''*Bs   !r   )r  r  )r   r  rv   r  r  r2  s         rz   __add__zBlockParameters.__add__  sR     4jBT5MBB19a8sc1S6AcF?*8998s   AN)r  r   r~   r   )rr   r   r   r   r  fieldr(  r   r   r   r   r   r_  r   r|   rz   r   r     sn     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG:r|   r   c                  *    e Zd ZdZd ZddZd Zd Zy)"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t        j                  t        j
                        | _        d| _        d| _        y r=  )	rC  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   rC  s     rz   r  z+CooperativeReductionWorkspaceCache.__init__  s@    	*66{7H7HIr|   c                    | j                   j                  |      }|r|j                         S | j                  j	                  |d      \  }}| j
                  j                  |||f       ||fS rR  )ri  rQ  popleftrC  	workspacerd  rc  )r   nbytescachedws_name	ws_offsets        rz   allocatez+CooperativeReductionWorkspaceCache.allocate  si    %%))&1>>##!YY00?  &'9!=>##r|   c                    | j                   D ]&  \  }}}| j                  |   j                  ||f       ( | j                  | _         g | _        | xj                  dz  c_        y NrE   )re  ri  rc  rd  rj  )r   ro  rq  rr  s       rz   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end  s_    *.// 	F&FGY  (//)0DE	F++1r|   c                H    | j                   }| xj                   dz  c_         |S ru  )rk  )r   priors     rz   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count  s#      Ar|   N)ro  r   )rr   r   r   r   r  rs  rv  ry  r   r|   rz   rb  rb    s    
$r|   rb  c                  $    e Zd ZU ded<   d Zd Zy)FixedTritonConfigzdict[str, int]r   c                     | j                   |   S rq   r   r   r  s     rz   rY  zFixedTritonConfig.__getitem__	  s    {{4  r|   c                    || j                   v S rq   r}  r~  s     rz   __contains__zFixedTritonConfig.__contains__  s    t{{""r|   N)rr   r   r   r   rY  r  r   r|   rz   r{  r{    s    !#r|   r{  c                      e Zd ZdZddZy)	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    c                Z    t         j                  j                  x}r||j                  fS |S rq   )rC   r  
_load_maskrA  )r   rB  r   s      rz   augment_keyzTritonCSE.augment_key  s,    88&&&4&tyy))r|   N)rB  r   r~   zUnion[str, tuple[str, str]])rr   r   r   r   r  r   r|   rz   r  r    s    
r|   r  c                  *    e Zd ZU eZded<   eZded<   dZ	 	 	 dI	 	 	 	 	 dJ fdZ	dKdZ
dLd	Zd
 Zd Zd Zd ZdLdZd ZedMd       Zddddd	 dNdZ	 dO	 	 	 	 	 	 	 dPdZdOdZ	 	 	 	 	 	 	 	 dQdZd ZdRdZ	 dS	 	 	 	 	 	 	 	 	 dTdZd Z	 	 dU	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dVdZdMdZdWdZ	 	 	 	 	 	 	 	 	 	 dXdZ	 	 dYdZ dYd Z!d! Z"d" Z#d# Z$d$ Z%d% Z&	 	 	 	 	 	 dZd&Z'd[d'Z(	 	 	 	 	 	 	 	 d\d(Z)	 	 	 	 	 	 	 	 	 	 d]d)Z*d* Z+d^d+Z,d, Z-d- Z.d. Z/e0d/        Z1dSd0Z2e0d1        Z3e0d2        Z4d3 Z5d_d4Z6d5 Z7dSd`d6Z8dad7Z9dbd8Z:dcd9Z;ddd:Z<	 	 	 	 	 	 ded;Z=ddd<Z>dfd=Z?dgd>Z@dhd?ZAdLd@ZBdidAZCeDd^dB       ZEdjdCZFdkdDZGeDdldE       ZHdmdFZIdjdGZJ	 	 	 	 	 	 dndHZK xZLS )oTritonKernelrH  helper_functionszCallable[[sympy.Expr], str]kexprTNc                   || _         || _        t        |   |fi | t	        | j
                  | j                        | _        t               | _	        t               | _
        t        t                  | _        || _        t        j                          | _        t%        t&        t&        f          | _        t+               | _        t/        j0                  t$              | _        t/        j4                         | _        t        t8                  | _        d | _        | j>                  r| jA                  | jB                         | jD                  r| jG                          | jI                          | jD                  r| jK                          y y rq   )&optimize_maskfixed_configr  r  r  newvar_prefixsuffixr   rM   post_loop_combinepost_loop_storer   r   outside_loop_varsr5  r  countblock_ptr_iddictr   block_ptr_to_bufferrH  r  rf  rg  pointer_advancementsCounter_load_countsr(   autotune_hintstriton_metar$  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   tilingr5  r  r  r  r  s         rz   r  zTritonKernel.__init__#  s3    $1(*6*T//=1?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! 7B6I6I6K )6859  ))$))4%%++-!%%002 &r|   c                    t        |      S rq   )r=   )r   r  s     rz   dtype_to_strzTritonKernel.dtype_to_strJ  s    5!!r|   c                p    | j                   xr) t        j                  j                  | j                        S rq   )r$  rC   choices should_use_cooperative_reductionr&  r   s    rz   r  z-TritonKernel.should_use_cooperative_reductionM  s-    $$ 
)S)SMM*
 	
r|   c                     j                   sJ  j                  D ]$  }|j                  |xj                  dz  c_        &  j                  d   } j                  rt        | j                  d         } j                  j                  |       _        t         j                         _
         j                  j                  d       t         fd j                  D              r j                  j                  d       yy)z/One time setup code for cooperative reductions.NrE   r  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  Z   K   | ]"  }|j                   rj                  |        $ y wrq   )r'  _has_constant_mask)r   r   r   s     rz   r   z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>n  s0      
   ''--
s   (+z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  r,  grid_dimr%  r  r   rC  
semaphoressemaphores_namerb  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  rz   r  z'TritonKernel.init_cooperative_reductionR  s    )))) $$ 	#D}}("	# KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 

 IIP
r|   c                   d}| j                   s| d}| j                  j                  d|        | j                         r| j                  j	                  d       y | j                   rJ | j                  j                  d       y )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r   r  r   _has_constant_xmaskr   )r   rsplit_aranges     rz   r  z,TritonKernel.init_cooperative_reduction_maskw  s{    >}},oY7M		.}o>?##%II }}$$IIer|   c                2   | j                   D ]q  }|j                  s| j                  || j                         ,| j                  s9| j                  j                  |j                   d| j                  |              s | j                  rt        d | j                   D              rS| j                  ddd      }| j                  |      }| j                  j                  d| j                  |              y | j                  | j                         y y )Nzbase = c              3  4   K   | ]  }|j                     y wrq   )is_loopr   r   s     rz   r   z2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =D4<<=s   baseTr   zrbase = )r,  r  iteration_ranges_codegen_headerr  r$  r   r!  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbases       rz   r  zTritonKernel.codegen_range_tree  s    $$ 		D<<44T499E&& 		##{{m74+L+LT+R*ST		   =D,<,<==66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r|   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r   s    rz   need_numel_argszTritonKernel.need_numel_args  s     r|   c                    | j                   xr4 t        j                  j                  | j                  | j
                        S rq   )r$  rC   r  should_use_persistent_reductionr&  r  r   s    rz   r  z,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r|   c                    | j                   rlt        | j                        | j                  dz   k(  rG| j                  r| j                  d   dk(  S t
        j                  j                  | j                        S y)NrE   r   F)	persistent_reductionr  r%  r#  r  rC   r  want_no_x_dimr&  r   s    rz   r  zTritonKernel.want_no_x_dim  sb    %%DKK D$;$;a$??  ((2a7799**4==99r|   c                     y)Nztl.device_assertr   r   s    rz   assert_functionzTritonKernel.assert_function  s    !r|   F)
copy_shapedense_indexingoverride_maskr  c          
     T     j                        j                  }d}t        t                  |D ]j  }t	        |t
        j                        sJ |xs t        |t        j                        }|rAt        |t        j                        r? j                  j                  |j                     }	j                  |	j                          t        |t        j"                  t        j$                  t        j&                  t        j(                  t        j*                  t        j,                  f      rt        j.                  D 
cg c]  }
t        ||
      r	t0        |
    }}
t3        |      dk(  sJ d|j                          j5                  |d    d       m t6        j8                  j:                  xs |xs  j<                  duxr dk7  }d}d}t        t                  } j?                         D ]@  }|jA                  |jB                        rd}nd}|j5                  |jD                   d       B |r jF                  rt6        j8                  jH                  rx|sv j<                  sjt3        |z
        dk(  rY jK                        sH|rF jL                  dk(  r7	 	 	 	 	 	 dd		 	 	 	 	 	 d fd
	 	 	 	 	 	 dfdd fd} |       }||S d} jO                        }t	        t
        jP                        r|r| dn jS                         }d| d| d} jT                  r jW                         st        dg      n
t                j<                  rj5                   j<                         tY        |||      S |r%|s#|r| dn jS                         }d| d| d}|n|s|rd| d| d}||rt        |g       j<                  rj5                   j<                          j[                         tY        |||      S c c}
w )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        FrE   zAmbiguous type: r   r   NTr  c                    t        j                  | |j                               }|yt        |j                  gt
        j                  |      g|gt
        j                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rF   match_affine_block_exprsymbolr   numelr   r   r   )r   
range_treer.  s      rz   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_block  sl     -DD:,,. >&%++,!.!=!=j!I J#H*;;JGH	 r|   c                   |j                         }t        j                  dt        j                  t        j
                  |g            \  }}t        dt        j                        | j                  t        ||            | j                  t        |||            z         }t        j                  | ||j                  |      }|y|\  }}}	t        j                  |      }
t         j"                  j$                  j'                  |j(                        t+        fd|
D              ryt,        j/                  |      }t1        ||
d         gt3        |
dd |dd       D cg c]%  \  }}t        j4                  t1        ||      |      ' c}}z   }|	D cg c]#  }t7        ||t,        j9                  |      i      % }}t;        ||||	      S c c}}w c c}w )
a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)rv   r   Nc              3  l   K   | ]+  }j                  |       xr j                  |        - y wrq   )rG  statically_known_power_of_2)r   r  	max_blockr   s     rz   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>f  sH        !==eYOO H$@@GGHs   14r   rE   r  )r  r   symbols	functoolspartialWildr4  r  range_tree_nodesr  r   r   rF   match_mod_div_block_exprr  get_slice_numelsrC   r  r   r  r!  r   r   r   r   r   Minr<   r   r   )r   r  r'  denommodulonum_dimsmatch_resultr1  r   block_index_exprsslice_numelslinear_block_sizer  r  r   r  r   r  r   r   s                    @@rz   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_block'  s   ( '--/	 !&"!))%**ykJ!v --.HY$>?++oi&OPQ	  3KK9j&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ". 
   %2$@$@$L!-|A?1 '*,qr*:DH&E"s IIg&7?E1 !2	3  y-*H*H*TU3 3 ' +#)	 3s   ,*G (Gc                6    fD ]  } || |      }||c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r  r  
match_funcmatchr  r  s       rz   match_block_pointer_subexprz:TritonKernel.indexing.<locals>.match_block_pointer_subexpr  s:     ''# %J 'tZ8E($% r|   c            	        t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                  d      }|D cg c]&  }t        j                  ||j                               ( }}t        d |D              }t               }t        ||      D ]@  \  }}t        |j                  |j                              dkD  r y  ||      }	|	 y ||	z  }B |t        |      z
  }
j                         t         j#                  ||
|j$                        S c c}} w c c}w )NT)reorderc              3  <   K   | ]  }|j                           y wrq   )r  r  s     rz   r   zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>  s     *QT4;;=*Q   rE   )r   r   r,  r   r-  )r<   r  r  r  active_range_treesrF   get_subexpr_involving_symbolr  r   r   r   r  intersectionr#  sumfilter_masksr   r6  r  )vtindex_relative_to_xyr_indexr,  r   index_subexprsrange_symbolsblock_paramssubexprr   r   r   r   r  r   s              rz   match_block_pointerz2TritonKernel.indexing.<locals>.match_block_pointer  sT   .8$2G2G2M2M2OP$!QAqvvIP/+ #55d5C !,	"  (DD3T[[]" " !+*Q[*Q Q.0%(n%E 
+MD' =55g6J6JKLqP# 9$GF~# F*L
+ 5s>7JJ !!),&--'$* +'"&.. .  E Q"s   D5+D;z.shaper>  r   z, tl.int32)xmaskr   r   .shape))r   r   r  rY   r~   Optional[BlockParameters])r  r   r  rY   r~   r  )r~   zOptional[BlockPtrOptions]).prepare_indexingr#  r   r   rb  r   r   r   r   r   r   r   r   r%  rA  r  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  rT  r   r   r  r  r  r  var_listr!  allow_block_ptruse_block_ptris_indirect_indexingrr  r  r>  dense_size_strr  r  r   r  )r   r   r  r  r  r  
index_varsr   r  cse_varr   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  optionsr   r   r   r  r  r  s   ``                   @@@@rz   r  zTritonKernel.indexing  s!    %%e,''

sO%	 	:Cc5<<000# ~]22(J TXX.((..sxx8  !2!23%%II))JJJJ''
 
 !. 9 9"%c40 t$" "
 >*a/N3CCHH:1NN/!2 3489?	:D MM(( ++d* qj	 	 
$S/+++- 	6D&&t}}5!%"
4;;-t 45	6 $$++!OOI/0A5--e4  J.!/B*,`!`/B`*`D .A* * *Z *+G"
%%e,	eU]]+2<J<v.$BUBUBWJ":,b;GI  )A)A)C&y1	&L	doo."9iZQVWWj2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I"M?3I??MM$//*)$y)ZUSSy"s   P%c                f   |j                         }|sd}n|r|dk(  sJ d|d}nd|}| j                  r| j                  d   j                  r|j	                         rdt        | j                         }| j                  j                  t        || d|j                  |d	                    || j                  |<   t        j                  D ]E  }|j                  |      }t        d
 |D              r'| j                   |   }	||	vsJ d       ||	|<   G ||fS |j                  |      }||fS )Nr   , other=0.0, boundary_check=z, padding_option='zero'r  r   = F)r9  c              3     K   | ]A  }t         j                  j                  j                  |t	        j
                  d              C ywr   N)rC   r  r   r   r   r>  )r   r   s     rz   r   z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>	  s9       GG$$<<VU]]STEUVs   AA	z@duplicate advancement for pointer '{block_ptr}' at type '{symt}')rL  r$  r,  r  r   nextr  r  r   rL   rD  r  r   r   rP  r  r  )
r   rA  r  r  r  checkr  r   advance_offsetsadvancementss
             rz   codegen_block_ptrzTritonKernel.codegen_block_ptr  sp    '')EM)))'y0GHE'y1E!!  $,,##%#D):):$;#<=IIIYKs8??3?+N*OP 37D$$Y/ &55 :"*":":4"@  "1  #88> 4 V4 +:Y':" % !,I%r|   c                    |j                  ||j                  |j                  d      }| dt        t        j
                  j                  |             d}d| d| | dS )NFr  r   	tl.store(r   )r  r   r   r  rC   r  	get_dtype)r   rA  r  r  r  r  s         rz   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line  sj    668'')=)=u

 '/0A0A$0GHIK9+RwugQ77r|   c                   |s|sy t        |t        j                        sJ | j                  |d      }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      }
| j                  j                  |
|	dt        j                         y )NFr  0)
assignmentr  )rb  r   Exprr  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferr   r!  rs   r  )r   r  rj  lowerr   r  r   r   size_strlinebuffers              rz   check_boundszTritonKernel.check_bounds"  s     $

+++===7(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 %%h/&$5Lr|   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )Nr  )	r   r   r"  r$  r,  r  r   r  loads)r   r  s     rz   r(  zTritonKernel.get_load_buffer<  sb      "h&:&:&<<<!!  $,,'') 99::r|   c           
     	   | j                   j                        }| j                  xx   dz  cc<   t        }| j	                  |      |}| j                  |d      }|j                         |j                         }t        d | j                  |      j                         D              }| j                  |      rd}	nX|sd}	nS| j                  rE| j                  d   j                  r,fd}
   d}	t        j                   t"        d	|
      }nd
}	|sr8|j%                         r(| j&                  rdt)        | j&                         }nd}nd
}	 d}t*        j,                  j.                  r"| j0                  j3                         }|   dkD  }	 | j                  |       xr | j                   xr | xr |}d
}|rd}d }t4        j6                  j9                        }t;              r7|}|t<        j>                  t<        j@                  fv rWt<        jB                  }nEtE        |tF              rL| jI                  |||      \  }}d| | |	 | d}|jK                  ||jL                  |jN                  d      }nVtE        |tP        jR                        rd| d| d}|jT                  }n&d| d|jV                   d|jX                   |	 | | d
}|t<        j>                  t<        j@                  fv r/t*        j,                  jZ                  r|dz  }t<        jB                  }|t<        j\                  k(  r/t<        j^                  j`                  |dz  }t<        j\                  }| jc                  |      }| jd                  jg                  | ||      |      }|jh                  dkD  rxx   dz  cc<   tE        |tj              sJ |jl                  |_6        |rd| d| d}| jd                  jg                  |||      }|jl                  r~|jn                  rd}n|t<        j\                  k(  rd}nd}| j&                  rt)        | j&                        n|}d|jX                   d| d| d}| jd                  jg                  |||      }| j                  r|jq                         ss| jr                  ju                  |       |S )NrE   Tr  c              3  &   K   | ]	  }|d k(    ywr   r   )r   r}  s     rz   r   z$TritonKernel.load.<locals>.<genexpr>`  s      
AF
   z, eviction_policy='evict_last'r  c                          kD  rsryy)N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsrA  s   rz   decide_laterz'TritonKernel.load.<locals>.decide_lateri  s    t$~5"3'$r|   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'r9  r   r@  r  r  r  r)  r  r   r   z0.0Truer"  r  );rC  r<  r  r   r  r  r   r   r   get_strides_of_loadr  is_broadcastedr$  r,  r  r  r  r3   r   _load_otherrV   r   r   skip_l1_cacher&  buffer_read_countsrC   r  r  r_   rs   r  r  rt   rb  r   r  r  r   r   r   r>  r   r   r   r  r}   rx  ry  r(  r   r!  	use_countr  r   r  r   r  rT  )r   rA  r   r  	make_lineoriginal_indexr  r   is_coalescedepr9  r  has_read_depsr?  r>  cachemodappend_broadcastr  r+  r  load_buffer
result_varzero	other_valr6  r   r7  r8  s    `                      @@@@rz   r,  zTritonKernel.loadK  s   iiood#''DQCK	 55e<==$=7((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H!!$'#D)D 77 (O4#'#9#9$Xu#U 	5!)UGB4zC==(..0D0Dd NEMM:!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk %--88MM88))

"u}}'8'8'@ &

**84XX&&{IdO5&Q
!#"*&7888'11
%j\4D3EQGD**;E*JJ!!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..{D.N
$$X-?-?-A*""&&z2r|   c           	        | j                   j                  |      }|}| j                  |d|d u       }|| j                   j                  v }| j	                  |      }	|r'|	r%| j
                  j                  t        |d             t        |t              r,| j                  |||      \  }
}| j                  |||
||      }n]|$d| d|j                   d| d|j                   d	}n7|d	k(  r$d
| d|j                   d| d|j                   d	}nt        d|       t        j                          }| j"                  s7| j$                  r+|j'                  | j)                  || j
                               | j
                  j                  t        ||             | j"                  s| j*                  j-                  |       |j/                          y )NT)r  r  ztl.debug_barrier()r  r@  r  r   r   
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)rC  r*  r  inplace_buffersr<  storesr   rL   rb  r   r  r   r   r   r  
contextlib	ExitStackr$  r  enter_contextguard_cooperative_storer  rT  close)r   rA  r   r  moder  rB  r  
is_inplacer<  r  r  r+  
exit_stacks                 rz   storezTritonKernel.store  s    iit$==ttt|=T TYY666
,,^<.KK!!,t5I"JKh0#55dCJIu44h	5%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r|   c                    | j                   j                         }|j                  t        |d| d             |j	                         S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  ry  r   rL   indent)r   rA  r,  rJ  s       rz   rS  z$TritonKernel.guard_cooperative_store  sC    
 88NNPd.?uK,PQR}}r|   c                   | j                   j                  t        j                         | j                  j                  |d         }| j                  |d         }	| j                  |d         }
| j                  |d         }|r| j                  j                  |d         nd}|r| j                  |d         nd}|t        j                  k(  rd}n!|t        j                  k(  rd}nt        d      | j                  j                  | j                  d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d|      }|S )z3
        See [Note: Inductor bucketize op]
        r   rE   r   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   z, )r  )r  rT  r(   ONE_ELEMENT_PER_THREADrC  r<  r  rs   r  r  r  r   r!  r"  )r   r  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyper5  s                   rz   	bucketizezTritonKernel.bucketize	  s^   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  ! # 
 r|   c                    | j                         }|dk(  rd| dS | j                  }dg||z
  z  dg|z  z   }| ddj                  |       dS )	NrE   z!triton_helpers.promote_to_tensor(r   r_  r   r`  r   ra  )r  r#  r   )r   r  ndimsnreducesizess        rz   reduction_resizezTritonKernel.reduction_resize8	  sj    '')A:6ugQ??)))VHw,>>$))E*+1--r|   c           
         | j                   dk(  r|S | j                         | j                   z
  }| j                         }|d| dgz   }t        | j                  j                  |t        |||      |            S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rE   NRBLOCKr  )r#  r  dense_size_listr   r   r!  r   )r   r,  r  r  target_ndimr  target_shapes          rz   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dimsA	  s    
 ""a'L--/$2I2II,,.$\k2hZ?HHum\JRW  
 	
r|   c                   234567 d8d}t        j                  |      D cg c]  }|j                   }}t        j                  ||      }t	        d |D              rHt        j                  |t
        j                        }t        j                  t
        j                         j                  sJ t        d  j                  D              } j                  |       t        |      } j                  r|j                   j                          j                  d   j                  d   }	 j!                         3 j#                  3 fd|      } j%                          j&                  z
  4	 	 	 	 	 	 d94 fd5	 	 	 	 	 	 	 	 d:5fd}
46 fd	}||f}| j(                  j*                  v r j(                  j*                  |   S t-        |      }t/        |      } j(                  j1                  |
      }t        d |D              |_        dj5                  |      22fd7 j6                  r!t8        j:                  j=                  |      } j#                  t>        |      }d; 7fd}dk(  rn?tA        |tB              r&tE        ||      D cg c]  \  }} |||       }}}n	 |||      }dv rtG         j(                  jI                   jJ                  d|	 d dtL        jN                  jP                  dk(  rt
        jR                  nt
        jT                  
            }ddd   6 | jJ                  |||       ndk(  r8 jV                  r jY                  ||7|      }np j[                  |      }n\dk(  rMtA        t\              sJ |\  }}}tC         fd j_                   jJ                  |||4      D              }n
dk(  r ja                  |      }ntA        tb              sJ  j(                  jI                   jJ                   5 jJ                  tG        |      d       |j                  
      }n j(                  je                  d| |
      }t8        j:                  jg                  |      } j#                  t>        |      }tA        |tB              s5 jh                  jk                  | d j!                          d| d| d       dv rd| d} jl                  jo                         } jh                  jk                  | d j!                          dt        jp                  |      jr                   d ju                  |       d       ddd   6 jJ                  jw                  d| d | d!6 d"| d| d| d|	 d#| d$ 7| d%|       d&| d$ 7| d%|       d&        | jx                  |||       n t{              r jY                  ||7|      }n݉dk(  r(d| d'}d| d(} jh                  jk                  | d j!                          d)| d        jh                  jk                  | d* j!                          d| d        jJ                  jw                  d+| d | d,| d| d| dt|        j~                   d-        jJ                  jw                  d+| d$ 7| d%|       d+| d$ 7| d%|       d+	       |} j(                  j1                  
      } j                   jx                  ||||4      }nt9        j                  |      } |||      }  jJ                  jk                  | d$ 7| |              |t
        j                  k(  r/| d.}!t              }" |
 jx                  tG        |      |!|"       n' |
 jx                  tG        |      tG        |      d         jV                  r{t8        j:                  jg                  |      }t        j                         }# jx                   j                  fD ]2  }$|$jk                  d/       |#j                  |$j                                4 dv r jx                  jk                  | d0 j                  | d1               j                  | d2||      }% jl                  jo                         } j                  ||t        jp                  |      jr                        }& | j                  ||%|&       n=t{              rdk(  sJ |\  }'}(}) j                  |'t/        |      |d         }* j                  |(t/        |      |d3         }+ j                  |)t/        |      |d4         }, j                   j                  |'|(|)|*|+|,4	       ndk(  re|\  }} j                  |t/        |      |d         }- j                  |t/        |      |d3         }. j                   j                  |||-|.4       n: j                  |t/        |      |      }/ |
 j                  tG        |      |/d        |#j                          | j(                  j*                  |<   tA        |tB              rt        d5 |D              sJ  j                  j                  |       d6v rt        |      d3k(  sJ t        |      |z  }t        |      t        |      k(  sJ tE        ||      D ]F  \  }0}1|1J |0j                  |1k7  s jx                  jk                  |0 d$|0 d7t        |1       d       H |S tA        |t              sJ  j                  j                  |       |j                  |d   k7  r7|d   J  jx                  jk                  | d$| d7t        |d          d       |S c c}w c c}}w )<Nc                    | j                   t        j                  t        j                  fv r$t	        j
                  | t        j                        S | S rq   )r  rs   r  r  rk  r9  rt   r-  s    rz   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcastY	  sF     ;;MMNN UEMM2 r|   c              3  `   K   | ]&  }|t         j                  t         j                  fv  ( y wrq   )rs   r  r  r  s     rz   r   z)TritonKernel.reduction.<locals>.<genexpr>i	  s"     MqU]]ENN33Ms   ,.c              3  :   K   | ]  }|j                    d   ywr   Nr!  r  s     rz   r   z)TritonKernel.reduction.<locals>.<genexpr>o	       MDdkk]$/M   r  r   c                v    j                   j                  j                  d|  d d| j                        S )Nr   r   r   r  r   r!  r"  r  )r  r	  r   s    rz   <lambda>z(TritonKernel.reduction.<locals>.<lambda>~	  s<    dhh''"1#R'7q9gg (  r|   c           
         dv }|rdnd}j                  | |      }dv rj                  | d d| d d      }nj                  | d d	| d d      }|| d
| d}|S )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   r4  minprodtriton_helperstl)r4  r  r  z2(r   r   r  r  )rt  rn  )	r,  r  result_type
use_helpermoduler  r  reduction_typer   s	        rz   final_reductionz/TritonKernel.reduction.<locals>.final_reduction	  s     (+HHJ)3%F00FE/--ha/r%3%qA --ha/qr#a@ & 'k]!4Lr|   c                F     | ||      }| j                  | d|        y)zU
            Generate a reduction and assign it to an existing variable.
            r  N)r   )r,  rI  r  r  r  s       rz   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_define	  s*     $FE;?EMMZLE734r|   c                    j                  | |      }j                  | |      }| j                  d| d| d d| d| d d| dj                  | d       d	       y )
N                z_val, z_idx = triton_helpers.z_with_index(r   )
                r  _idx
                )rt  r   rn  )r,  rI  r  r   r  r  root_opr   s       rz   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce	  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr|   r  c              3  >   K   | ]  }t        |d          r|  ywr  )r9   )r   r  s     rz   r   z)TritonKernel.reduction.<locals>.<genexpr>	  s!      *
(;CF(CC*
s   r   c                :    s| S t         j                  | |      S rq   )r  rv  )tvalfvalconds     rz   
where_condz*TritonKernel.reduction.<locals>.where_cond	  s     (..tT4@@r|   c                v    j                   j                  j                   | |      | j                        S )Nr  r  )r  defaultr   r  s     rz   _mask_valuez+TritonKernel.reduction.<locals>._mask_value	  s5    xx((LL*UG"<EKK )  r|   online_softmax_reduce)argmaxargminr   zindex, r  r  r4  r  welford_reducewelford_combinec              3  n   K   | ],  }j                   j                  j                  |        . yw)r  N)r   r!  r"  )r   r  r  r   s     rz   r   z)TritonKernel.reduction.<locals>.<genexpr>	  s3      # HH%%dllE%G#s   25r   = tl.full(r   r   _indexr  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r  _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrE   r   c              3  <   K   | ]  }t        |t                y wrq   )rb  r  r  s     rz   r   z)TritonKernel.reduction.<locals>.<genexpr>
  s     LAz!%67Lr  )r  r  r  )r  rK   r~   rK   )r  r   r  r   r~   r   )rI  r   r  r   r  r   r~   r   )r~   rK   )Spytreetree_leavesr  tree_mapr   rs   r$  rt   r$  r   r,  r  sortedr  rc  r!  r	  _map_tuple_or_scalarr  r#  r   reduction_cacher  r  r@  r   r   r  r   	Reductiondefault_valuerV   rb  r  r   r   r!  r"  rC   r  rr  r  r  r  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrK   namedvardefault_accumulatorr  r   r&  select_index_dtypeiinfor4  r  r   r  r7   r   rV  %online_softmax_reduce_final_reductionget_reduction_combine_fnr}   r  rP  rQ  r  rR  rZ  rn  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionrT  r  r  r  r  r  rT  )8r   r  r/  r  r  rw  r3  original_dtypesmasksreduction_range_prefixr  r  rB  acc_typetorch_acc_typerI  r  r  r  dmasked_valueaccumulator_indexmeanm2weightaccumulatorrr  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedaccumulator_casted_strr  rW  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr  
orig_dtyper  r	  r  r  r  r  s8   `` `                                              @@@@@@rz   	reductionzTritonKernel.reductionR	  s   	 170B0B50IJ399JJe4M_MM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!? ,,.))
 
 %%'$*A*AA		 '	 		 	4
	5
	5 
	5 '	
	5
 
	5	 6	00088++I66"9-))4((///?
) *
 *
  

 zz% 	A
 $$ll00KG//wGG !88 E5)>A%>QRdaAq 1RR*5':!55$'HH%%*+A*B',W^_88//:= $kk"[[ & %! &+e<^LLL*l<M  #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,999%1"r6" #!%dBU"# 
  #:: "BB5%P
!,<<<!XX..LL#DLL#l2CTJ&,, / 
 ((++a
|,<N+SKll66~yQG//wGGgu-		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n5!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5_9
!GG**##
  88S
$[%8&&"m3z';'G&HI 

* 1<}L-I*"5e"<K*..J.#	 +..J[AQSW %%ll66~yQG#--/J..0D0DE 7./((67
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n5%)99998B5Y KK!1)!<gaj	 II/	:GAJ #MM!#3I#>
 33((!
  #::)3&
JJJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W '((#j/5$ .8  +j%(LLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z?#C Z!---99
***44%s3%t,?
,K+LAN"  j*;<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` _ Kd  Ss   ooc                   | j                  |||      }| j                  |||      }t        d      D cg c]'  }t        | j                  j	                  |            ) c}\  }}|j                  d| d| d| d| d| dt        j                   d| d| j                  |        d| d| j                  |        d       ||fS c c}w )Nr   r  
            r   z9 = triton_helpers.online_softmax_reduce(
                )
            r  )	rt  r*  r   r   r@  r   r   rV  rn  )	r   r,  r  r  r  r  r  r  r  s	            rz   _online_softmax_reducez#TritonKernel._online_softmax_reduce
  s     66vPUV66vPUVMRSTX!V#dhhooEo&B"C!V
JL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%% "Ws   ,Cc           	     D     fd|||fD        \  }}}d| d| d| d| d	}t        d      D cg c]'  }t         j                  j                              ) }	}j	                  dj                  |	       d|        t         fd|	D              }
|
S c c}w )	z;
        Helper to codegen triton_helpers.welford.
        c              3  D   K   | ]  }j                  |        y wrq   )rt  )r   r  r,  r  r   s     rz   r   z(TritonKernel._welford.<locals>.<genexpr>
  s(      
 ((>
s    ztriton_helpers.welford(r   r   r   r  r  c              3  @   K   | ]  }j                  |        y wrq   )rn  )r   r  r   s     rz   r   z(TritonKernel._welford.<locals>.<genexpr>
  s     Xud33E:Xs   )r*  r   r   r@  r   r   r  )r   r,  r  r  r  r  r  welfordr  welford_resultsresult_valuess   ``    `    rz   r  zTritonKernel._welford
  s    
F+
b& ,D6B4r&C5JFKAhO3txxU;<OODIIo67s7)DEXXX	 Ps   ,Bc                   | j                         | j                  z
  }| d}| d}	| d}
| j                  j                  | d| j	                          d| d       | j                  j                  |	 d| j	                          d| d       | j                  j                  |
 d| j	                          d| d       |dk(  r>|\  }}}| j
                  j                  d| d	|	 d	|
 d
| d|	 d|
 d| d| d| d       n8|dk(  sJ | j
                  j                  d| d	|	 d	|
 d| d| d|	 d|
 d       | j
                  j                  d| d || d|       d|	 d ||	 d|	       d|
 d ||
 d|
       d       |}| j                  j                  |      }| j                  j                  |      }| j                  | j                  |||||	|
||	      S )z%Helper to codegen a welford reduction_mean_m2_weightr  r   r   r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  r  r  r  )r  r#  r  r   r	  r"  r   r   r@  r  r  )r   rI  r  r  r  r  r  r  r  accumulator_m2accumulator_weightr  r  r  r  r  r  s                    rz   r  zTritonKernel.welford_reduce
  s    %%'$*A*AA#E*&<s+ *|73		m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$D"fLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !HHOO%O0	e422""

 
	
r|   c
                    | j                  ||||||	      }
|||g}t        ||
      D ]  \  }}|j                  | d|         |||fS )z0Helper to codegen call to triton_helpers.welfordr  )r  r   r   )r   r,  r  r  r  r  r  r  r  r  r  result_exprsresult_exprr  s                 rz   r  z+TritonKernel.welford_reduce_final_reduction3  si     vtReD#Y>"%lF"; 	6KMM[MUG45	6 I}44r|   c                    | j                  |||||      }||g}	t        |	|      D ]  \  }
}|j                  |
 d|         ||fS Nr  )r  r   r   )r   r,  r  r  r  r  r  r  r  r  r  r  s               rz   r  z2TritonKernel.online_softmax_reduce_final_reductionG  se     ,,VXxeT"J/"%lF"; 	6KMM[MUG45	6 :%%r|   c                D    | j                   r| j                   d   S t        S )NRSPLIT)r  r+   r   s    rz   
max_rsplitzTritonKernel.max_rsplitQ  s"    $$X..  r|   c                   | j                   d   }| j                         sdnd}||j                  z  | j                         z  }| j                  j                  |      \  }}| j                  j                  d| d| d| j                  |       dt        |       d| d	| d
| dd       | j                  j                  | d| dt        |       d       | dS )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r  zxindex < xnumelNr  z_ws = (r:  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r   r  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  _peers)r%  r  r  r  r  rs  r  r   r  r=   r  r   rV   )	r   rI  r  default_valxnumelr   ro  rq  rr  s	            rz   r  z7TritonKernel.codegen_cooperative_reduction_peer_combineV  s$    S!(,(@(@(B %..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 	&&l+J< 8eers~e  eA  ACD	
 V$$r|   c                   | j                   sJ d| _         | j                  |d      }d| _         | j                  j                  |      }t	        j
                         }| j                  r+|j                  | j                  || j                               t        |t              rY| j                  j                  t        || j                  |||j                  |      |d|j!                                            nXt        |t"              sJ | j                  j                  t        |d| d|j$                   d| d|j&                   d		             |j)                          y )
NFTr  r  r  r@  r  r   r   )r$  r  rC  r*  rP  rQ  r  rR  rS  r  rb  r   r   rL   r   rD  rL  r   r   r   rT  )r   rA  r   r  r  r  rW  s          rz   store_reductionzTritonKernel.store_reductionq  sR    $$$$ %==$=7 $iit$))+
%%$$,,T43G3GH h0  **55  ,+H,C,C,E+HI	 h888  **uD););(<CwbIZIZH[[\] 	r|   c           	        
 t               j                  d       t               
t        d      D cg c]!  t	        
fdt        |      D              # }}dj                  d t        j                  j                  |      D              }j                  d| d       t               dd	d
l
m}  |        G 
fddt              }j                         5  t        j                   |             5   || }	dj                  d |	D              }	j                  d|	        d d d        d d d        | j                   j#                  j%                               S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr   c              3  V   K   | ]   }j                  d  d| |          " yw)r  r  r  N)r  )r   nr   dtypesr}  s     rz   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s.     X1#,,QCq}F1I,>Xs   &)r   c              3  2   K   | ]  }t        |        y wrq   r   r  s     rz   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     Rc!fR   zdef {name}():rM  r   rd   c                  2    e Zd Z	 	 	 	 	 	 	 	 d fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    d| z   t        |      |i |}j                   t        	|      |i ||      S )Nr  r  )r  r!  )
r   rA  rC  r  output_dtyper   dtype_handlerhelperhelper_name	overridess
        rz   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._default  sq     4&z) w!   # " #
 ||,GIt,d=f=& $  r|   N)rA  r   rC  ztuple[Any, ...]r  dict[str, Any]r~   r   )rr   r   r   r  )r   r  r  r  r  s   rz   CSEProxyr
    s-    '6@N r|   r  c              3  2   K   | ]  }t        |        y wrq   r  )r   r*  s     rz   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     BFBr  return rN  )rM   r   rJ   r*  r  r   r  r  from_iterabler*  r  re   r%   rZ  rC   set_ops_handlerr  rT  r   )r   fnnum_argsr  r}  rC  	signaturere   r  outputsr   r  r  r  r  s      ``     @@@@@rz   _lift_helperzTritonKernel._lift_helper  s]     !'e 1X
 XhXX
 
 IIRioo.K.KD.QRR	=267#%	 *P24	 	~ 	$ ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RRU
J	2 	2 	2 	2s)   &E3+F2E88F8F	=FFc                     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d       g }g }t        d |D              }t        j                   j                  j                   j                        } j                  |t        |      |      } j                          j                   z
  }	t#        ||      D ]1  \  }
} j                  j                   j                  |
 dt%        |       d|      } j                  j                   j                  d| d	 j'                          d|      }
|j)                  |
       t+        |      } j,                  r j                  j/                  |      } j1                         }d
|d<   dd	j3                  |       d}|j4                  rdnd} j6                  j9                  | d| d	| d	| d       |j)                  |       4 d  fd} |d |       d|	 d	| d|||      } j,                  s|D cg c]#  } |d| dt;        |j<                              % }} |t        |      t        |            } |t        |      |      }t#        ||      D cg c]   \  }} |d| d	| d|j<                        " }}}t#        |||      D ]*  \  }}} j                  j9                  | d| d	| d       , n|}|D ]$  }t?        |t@              sJ t        |      |_!        & t        |      S c c}w c c}}w )NTODOc              3  :   K   | ]  }|j                    d   ywrz  r{  r  s     rz   r   z$TritonKernel.scan.<locals>.<genexpr>  r|  r}  z(ops.scan not supported inside ops.maskedc              3  2   K   | ]  }t        |        y wrq   r?   r   r  s     rz   r   z$TritonKernel.scan.<locals>.<genexpr>       Fe*51Fr  r  r   r  r   r   r^  r  r`  ra  zfloat('nan')z-1r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   yw,Nr   r   r  s     rz   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>       <EugQK<r2  r   r  s    rz   csvzTritonKernel.scan.<locals>.csv      88<V<<<r|   c                   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S |D cg c]  }j                  j                  |        }	}j                  j                   |	       d|         t        |	|      D ]*  \  }
}|r||
_	        j                  j                  ||
       , t        |	      S c c}w c c}w c c}w )Nr   c              3  T   K   | ]  }j                   j                  |       ! y wrq   r   containsr   rB  r   s     rz   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>        LI488$$Y/L   %(r  r  )r  r*  r  r   rQ  r@  r"  r   r   r   rA  r  )r+  r  r  r  r  r}  
cache_keysrB  _dtyperesult_varsrI  r,  r   s              rz   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple  s    FA;@8DaTF"QCr%1DJDLLLAKLIY/LLGMNV488???8NKNLL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELNs   C="D.#Dztl.associative_scan((r  ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"r$  r  r   r,  r  r  r  r  r  r  r   r!  r"  r  r  r  r#  r   r  r	  rc  r  r  r@  rq  r   r  r  r   r?   r  rb  r  r   )r   r  r  r  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr  r  r  value_dtyper  r  reduced_sizer  r8  partial_scan_varspartial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr7  acc_nextpartial_reducerI  r,  s   `                            @rz   scanzTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j#f+vN%%'$*A*AA/ 	1LE5((++'1%89; , K
 HH%%";-r$2E2E2G1HJ & E
 %%e,&u-H,,"hhooEo:#335#&R !"499\#:";1=,1,C,C.		##"m;|nBwir(STU ##K05	18	=	& )#C(:$;#<CuBGXFYYZ[	
 (( ):#
 %	 12B1CCtu-.>.D.DE# # #5#6>Q8RSI'l(;=NON 03>CT/U
 ,I|	 ,YKr,qI&,,K  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	5Jj*;<<<#-e#4J 	5 [!!;#s   '(M&%M+c                     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d        j                  sJ d       t        j                   j                  j                   j                        } j                          j                  z
  }t        d |D              }t!        |      t!        |      k(  sJ t#        |      D 	cg c]'  \  }}	 |d|	 d j%                          d||   	      ) }
}}	d
  fd} j                  d   j&                  sJ  j)                   j                  d         rdnd}t!        |      dk(  r0d|
d    d|
d    d| d| d| d| d} ||t!        |      ||      }nt+        d      t-        ||      D ]  \  }}||_        |j0                  |_         t        |      S c c}	}w )Nr  c              3  :   K   | ]  }|j                    d   ywrz  r{  r  s     rz   r   z$TritonKernel.sort.<locals>.<genexpr>F  r|  r}  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  2   K   | ]  }t        |        y wrq   r!  r"  s     rz   r   z$TritonKernel.sort.<locals>.<genexpr>Q  r#  r  r   r   r   r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   ywr&  r   r(  s     rz   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>[  r)  r2  r*  r+  s    rz   r,  zTritonKernel.sort.<locals>.csvZ  r-  r|   c                   t        |      D cg c]  }|  d| d|  }}t        
fd|D              r)|D cg c]  }
j                  j                  |       c}S t        |      D cg c]!  }
j                  j	                  ||         # }}
j
                  j                   	|       d|         t        ||      D ]*  \  }}|r||_        
j                  j                  ||       , t        |      S c c}w c c}w c c}w )Nr   c              3  T   K   | ]  }j                   j                  |       ! y wrq   r0  r2  s     rz   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>_  r3  r4  r  r  )r*  r  r   rQ  r@  r"  r   r   r   rA  r  )r+  r  r  r  r}  r5  rB  r7  rI  r,  r   s            rz   r8  z'TritonKernel.sort.<locals>.cse_multiple]  s    ;@8DaTF"QCr%1DJDLLLAKLIY/LLEJ1XN488???;NKNLL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELNs   C>"D,&Dr  r   rnumelr   ztriton_helpers.sort_with_index(r   rE   z	, stable=z, descending=zUnhandled sort)r$  r  r   r,  r  r  r  r  r  r  r   r!  r"  r  r#  r  r  	enumerater	  r'  r  rm  r   r   r   )r   r  r  stable
descendingr  r;  r  r}  r  r9  r8  rP  r+  r7  rI  	input_varr,  s   `                @rz   sortzTritonKernel.sort=  s2    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k)))
 &f-	
 5 "5'D,?,?,A+B!DFSTI
 
	=	& #00002243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tS[%HK !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!Q
s   ,Hc                   | j                   s=| j                  s1| j                  s%| j                  s| j                  s| j
                  sy| j                  D cg c]  }|j                  s| }}| j                  rWt        |      dkD  rHt        |      D ]  \  }}| j                  j                  |      5  |j                  }| j                  rdnd}| j                  rdn| d}| j                  j                  d| d	| d
| d
|j!                          d	       ddd       | j                  j                  |dz         5  | j#                  || j                         ddd        | j                  j                  t        |            5  | j%                  | j                         | j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         ddd       t)        g t        |            D ]o  \  }}| j                  j                  |dz         5  | j*                  |j,                     j/                         D ]  \  }}|t        |      dz
  k  rs||dz      }	| j*                  |	j,                     |   }
t0        j3                  |	      }t5        |	j6                  |      }t9        ||
      D cg c]  \  }}|||z  z
   }}}| j                  j                  t;        | j<                  |   | d| d
t>        j@                  jC                  |       d              	 ddd       | jD                  jG                  | jH                         |jK                          r n| j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  rb| j                  s| j
                  rJ| jL                   d}| j                  j'                  d| dd       | jN                  jQ                          | j                  j'                  | j
                         | j                   jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j
                  jS                          yc c}w # 1 sw Y   xY w# 1 sw Y   BxY w# 1 sw Y   xY wc c}}w # 1 sw Y   ^xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nr   )r   rsplit_startr"  
rsplit_endr  zfor zoffset in range(r   zBLOCK):rE   z = tl.advance(r   z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Tr  )*indexing_coder/  rO  r"  r  r  r,  r  r$  r  rQ  r  rZ  r!  r  r   r   r  r  r   r)  r  r   r  r   r   r   r  r   rL   r  rC   r  r  r   
invalidater  cache_clearr  r  rv  clear)r   r   
loop_treeslevelr!  
loop_startloop_endr  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                   rz   codegen_bodyzTritonKernel.codegen_body}  s	    zz{{||%%##'+'7'7Ht4<<dH
H  S_q%8(4 JtYY%%U%3 ![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh YY%%UQY%7 J88tyyIJ JJ !!Z!9 ...tyy9		  !3!34		  ,		  .		  -.  ((@)J*?(@A #tYY%%UQY%7 262K2K		3eg.	; !3z?Q#66(2519(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[BR1S+$-C !$d]&: :+K +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!4 ##D$:$:;  "9#< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD		--.  "

$$&""$] I J J. .,+ sR   V=*V=9A&WWB0WBW/#W)5AW/W	W	W&)W//W9	c                V   g }| j                         rg }| j                  d|g        |D ]  }t        |t              r|j	                  t        |             .t        |t              rL|j	                  t        t        j                  j                  j                  |j                                     t        |t        j                        rB|j	                  t        t        j                  j                  j                  |                   t        dt        |              |S )Nr   z!Unsupported numel argument type: )r  add_numel_to_call_argsrb  r3  rc  r   ra   rC   r  r   	size_hint
inner_exprr   r$  r  r  )r   rC  
numel_argsr  s       rz   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args  s    !+-J''J;! Vc3'KKC)_5KKAGG$4$4$>$>s~~$N OPUZZ0KKAGG$4$4$>$>s$C DE$'Hc%TUUV r|   c                   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]  \  }	}
dt        |       }t        j                  j                  |	      }|r|j                  | dt        j                  j                  j                  |j                                dt        j                  j                  j                  |j!                                d|j#                          d|j%                          d
       n|	t        j                  j&                  v rt        j                  j&                  |	   }|j                  | dt        j                  j                  j                  |j)                                dt        j                  j                  j                  |j+                                d|j,                   d|j.                   d
       nt1        |
t2              rZt        j                  j                  j5                  |
j6                        }d|
j8                  v rd	}|j                  | d
|        nt1        |
t:              ryt        j                  j=                         }t        j                  j                  j5                  |
j                        }|j                  | d| d| d|
j.                   d       nt?        d|	       |jA                  |        |jC                  | jE                                |j                  ddjG                  |       d       d d d        |j                  g d       t        j                  j=                         }|jH                  }|j	                         5  |j                  dt        j                  jJ                  jM                  |       d       |j	                         5  |j                  t        j                  jJ                  jO                  |             d| }|j                  | d| d       |j                  tQ        tR        jT                         d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  dt        j                  jJ                  jM                  |       d       |j	                         5  |j                  t        j                  jJ                  jO                  |             |j                  dtQ        tR        jT                         d       d d d        d d d        |j                  g d       |j	                         5  |j                  d       |j                  d       |j                  d       |j                  d       |j                  d|        |j                  d       |j                  d       d d d        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)N)r   r   zdef get_args():arg_z = rand_strided(r   z
, device='z	', dtype=r   r=  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r  r'  )
rr  zdef call(args):zwith r_  streamz = get_raw_stream(z.run(*args, stream=)rr  rr  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))rr  rr  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rM   rC  python_argdefs
writelinesrZ  r  r  r   r  rC   r  try_get_bufferr   r   
size_hintsget_size
get_stride
get_devicer  	constantsrj  r.  devicer  rb  rR   rl  r  rA  rT   get_current_device_or_throwKeyErrorrc  extendro  r   r   
device_opsdevice_guard
set_devicer   r8   KERNEL_NAME)r   num_gbr5  _argdefs	call_argsr  r  name_cnt	var_namesarg_namearg_sigvar_namer  const_tensorsymval_hintr|  r  current_devicer   stream_names                       rz   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark  s>   !,0II,D,D,F))Y56]]_ %	@ (HI%(I%>  +!'!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  1"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC6WW@@BFGG,,66w}}EE$$#*OE7*VHIV]VcVcUddef #DXJO    *A +B T==?@wtyy';&<A>?K%	@N 	9:<<>$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;XY		 	DE]]_ 	N R 01L y12=>N	  g%	@ %	@X 
	 
	  	 		  sf   L2YAY!BY$YAY4$A%Y(	Y4<A;Z YY	YY%(Y1	-Y44Y= Z
c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentrD  rC   r  r  import_get_raw_stream_asr   s    rz   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernelK  s:     F177%%>>?OPQ
 	
r|   c                    | j                   ry| j                  ry| j                  r| j                  sJ y| j                  ryy)Nr  r  r  r  	pointwise)r  r  r  r$  r   s    rz   _get_heuristiczTritonKernel._get_heuristicT  sD    !''*&&(((()""r|   c                    t         j                  j                  j                         t        j                         t
        j                  t
        j                  t
        j                  j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  j                  t
        j                  j                   t
        j                  j"                  d} t         j$                  j&                  d| d<   t        j(                         rd| d<   t
        j*                  rLt
        j*                  | d<   t
        j,                  | d<   t
        j.                  | d<   t
        j0                  | d<   t
        j2                  r9t
        j2                  | d	<   t
        j4                  | d
<   t
        j6                  | d<   | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hiprw  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rs   utils_tritontriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r  r  r  r  rx  ry  rw  r  r  r  r  r  r  r  )inductor_metas    rz   inductor_meta_commonz!TritonKernel.inductor_meta_common`  su    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r|   c                   ! t               }i }| j                  j                         D ]  \  }}t        |      r| j                  st
        j                  j                  j                  |      }t        |t        t        j                  f      sd}nt        t        |            }|||<    ||j                  t                      t
        j                  j!                         j"                  }|dk(  r|j                  d       n|j                  d       t$        j&                  r|j                  | j)                                | j*                  j-                         \   }	!}	t/        !      D ]  \  }
}t        |t0              st3        t        j4                  |j6                        }|t
        j                  j                  j8                  v sbt1        |j:                  t
        j                  j                  j8                  |         !|
<    t=        t>                  }| j@                  D ]  }|| j*                  jB                  v r(|jE                  | j*                  jB                  |          || j*                  jF                  v rj|t
        j                  jH                  vrN|| jH                  vr@|jE                  t3        tJ        | j*                  jF                  |         jL                         || j*                  jN                  v s| j*                  jN                  |   }t        |tP              rJ |jE                  |        tS         !      D ]O  \  }}t        |tT              s|jV                  tX        jZ                  k(  s5|jE                  |j:                         Q t]        |      }| j_                         D ]Z  }t1        |j`                   d|jb                        }!je                  |        je                  tg        |j:                               \  !fd}| jh                  D ]K  }|jj                  r| jl                  r|jn                  ) ||j`                  jq                          d       M | jr                  r |d       tu        !| jv                   	      }|ty        jz                  t
        j                  j!                               i d
}t
        j                  j|                  xs t
        j                  j~                  }| j                         j                  t        | j                        t?        t        j                        ||| j                  | j                  | j                  d| j                         }| jr                  r| jl                  |d<   d }t$        j&                  st$        j                  r| j                         dz  }||d<   t        !      g|d<   t        !      D ]  }d|d   !|   j:                  <    || _N        | j                          | j                  D ]$  }|j                  d       |j                  |       & | j                  r2d| j                          d| j                  j$                  d|d|d	}n| j                  r;| j                  j                         }d| j                          d|d| d|d|d}nYd}t        |      dk(  rt        t        !            dk(  rd}nd}d| j                          d|d| d|d|d | j                   d}|j                  |       |j                  d!|xs t?        t        j                         d"dj                  d#  D               d$       |j                         5  | j                  |       | j*                  j                         D ]  \  }}|j                  | d%|         |j                  | j                         d d d        t$        j&                  r |j                  | j                  |             |j                         S # 1 sw Y   IxY w)&Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                    t               rj                  t        |              j                  t        | d             y )NT)is_constexpr)r>   rc  rI   rG   )r  argdefsr  s    rz   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg  s/    -/  h!78NN78$?@r|   r   r  )
size_dtyper  )r  r|  r{  )	grid_typer  kernel_namemutated_arg_namesoptimize_memr   num_loadnum_reductionr  g    eAkernel_num_gbconfigsrE   r{  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  <   K   | ]  }|j                           y wrq   )	full_namer  s     rz   r   z.TritonKernel.codegen_kernel.<locals>.<genexpr>Z  s     CcVWAKKMCcr  r  r  )arM   r%  r  r9   r$  rC   r  r   symbolic_hintrb  r3  r   r>  r-   r   r   r}  r  r   benchmark_kernelr  rC  rt  rQ  rR   r	   r   r  inv_precomputed_replacementsrA  r   r   	mutationsinput_buffersrT  rN  removed_buffersrN   
inner_nameoutput_buffersrQ   r   rT   	zero_moderU   ZERO_ON_CALLr  r  r!  r  rc  rG   r,  r'  r  
tensor_dimr   r  r`   rr  r)   r6  is_inferenceis_backward_get_grid_typerr   setr  r8   DESCRIPTIVE_NAMEr   r  r  r  r  estimate_kernel_num_bytesr\   r]   r  ri  r  r   r  r  r&  get_reduction_hintr  r^   r5  r  r   rZ  codegen_static_numelsaliasesr  r  r   )"r   rA  coderw  r!  r  
numel_hintrl  device_typer  r}  r  r  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturer  r  r  r  arg_numr  heuristics_linereduction_hint	tile_hintoldnewr  r  s"                                   @@rz   codegen_kernelzTritonKernel.codegen_kernel  s|   
![[..0 	+MFE"6*43H3H))77>Jj3*>? !	+C
O<	!*Jv)	+, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 "#( 	/H499222  !8!8!BCDII555AGG$;$;;D$8$88  )B)B8)LMXX 499333#yy77A%lJ???  .	/6  3 	/LGS3-MM%6%C%CC  .	/ l+++- 	2DU3TZZ@GW%NN77<<01	2	A $$ 	=D  T%>%>&!2!2!4 5U;<	= %%h' 1$"2"2G!
 /&--agg.Q.Q.ST'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-(!//
 '')
 %%484M4MM01""f&>&>335;F-3M/*"+I"6!7I +95 	BG@AK$Yw%7%<%<=	B '++ 	 FNN2KK	  #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[] 	#&&t, II--/ 1S#c#/01KK		"		# ""KK55f=>}}	# 	#s   2A%aa(c                   t         j                  j                  j                  |       } t	        | t
        j                  t        f      rt        |       }t        |      }|S d}t         j                  j                  j                  | |      sC|dkD  rt        d|        |dz  }t         j                  j                  j                  | |      sC|S )N   i @  z!Failed to find static RBLOCK for r   )rC   r  r   simplifyrb  r   r>  r3  r-   statically_known_leqr  )rP  r3  s     rz   _get_persistent_RBLOCKz#TritonKernel._get_persistent_RBLOCKg  s    !!**62fu}}c23f+C!#&C 
 Cgg&&;;FCH?$'H%QRRq gg&&;;FCH 
r|   c                N    	 t         j                  |        y# t        $ r Y yw xY w)NTF)r  r  r  )rP  s    rz   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCKu  s*    	//7 		s    	$$c                   d	d}| j                   D ]M  }|j                  r| j                  rdt        j                  j
                  j                  |j                        } ||      r)|j                  |j                   dt        |              |j                  r| j                  r| j                  r1| j                  | j                  |j                              }d| d}n| j                  |j                        }|j                  |j                  j!                          d|        |j                  dk(  s/| j"                  s=|j                  d       P y)
a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        c                B    t        | t        j                  t        f      S rq   )rb  r   r>  r3  )r  s    rz   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r|   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r  zXBLOCK: tl.constexpr = 1N)r  r   r~   r}   )r,  r'  r$  rC   r  r   r  r  r   r!  r3  r  r  r  r&  r  r   r   )r   r  r  r   simplified_tree_numelr  r3  s          rz   r  z"TritonKernel.codegen_static_numels}  s   $	: $$ 	;D$$(=(=()(8(8(A(A$**(M%$%:;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC$++"3"3"5!66LSERS{{c!dmm9:	;r|   c                   t        | j                  D cg c]  }t        |j                          c}      }| j                  r|dk(  sJ t
        j                  S |dk(  rt
        j                  S |dk(  rIt        t        | j                  | j                              rt
        j                  S t
        j                  S |dk(  rt
        j                  S t        d|       c c}w )NrE   r   r   z"Unsupported number of dimensions: )r  r,  r3  r'  r  r&   CooperativeReductionGridGrid1Dr   r   needs_yz_grid_overflowGrid2DWithYZOverflowGrid2DGrid3Dr  )r   r   r  s      rz   r  zTritonKernel._get_grid_type  s    8H8HI***+IJ%%6M6$===!V$+++!V3t22D4D4DEF(===$+++!V$+++=aSABB Js   C!c                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  r| j                  s|j                  |       |j                  t        |              y rq   )r,  rb  r  r   r>  r   rC   r  wrapper_codegenerate_numel_exprr'  r$  rc  r  )r   rA  r  	arg_typesr   r  s         rz   rk  z#TritonKernel.add_numel_to_call_args  s    $$ 	-D$**u}}ell&CDzzww++??dK$$(=(=  &  d,	-r|   c                   t         j                  j                  }|j                          | j                  j                         \  }}}}| j                  |||       | j                  j                  D ]  }|j                  |        |j                  ||d|| j                         t        | j                  j                        D ]  }|j                  |        y )NT)r   r  r  )rC   r  r  write_triton_header_oncerC  rt  rk  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  r)  generate_workspace_deallocation)r   rA  r6  wrapperr  r  r  wss           rz   call_kernelzTritonKernel.call_kernel  s    ''&&((*%)YY%=%=%?"9a##D)Y?))** 	6B11"5	6 	$$(( 	% 	
 499334 	8B33B7	8r|   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]w  \  }}t        |t              st         j                  j                  r|j                  d| d| d       Jd| d}|j                  |       d| d}|j                  |       y y )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rC   r  r  rC  rt  r   rb  rS   cpp_wrapperr   )r   r  r  r  arg_signaturesr  arg_signaturer+  s           rz   codegen_nan_checkzTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'9na"%i"@ 
	,C-377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+
	,r|   c                    t        |i |S rq   )r  )r   rC  r  s      rz   create_cse_varzTritonKernel.create_cse_var  s     $1&11r|   c                   |j                    d| j                  | j                  |j                               }|j                  j
                  r| j                  j                  |       y | j                  j                  |       y r  )	rA  r  r&  r  rootr  rY  r   r  )r   entryr+  s      rz   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entry  sd    **SD,@,@,L!M NO::((. II%r|   c                   |j                   J | j                  |j                         }| j                  }|dk7  rd| dnd}| j                  r| j                  r|j
                  r| d}d|j                  j                          d| | S )Nr  r  r   r   z + rsplit_startztl.arange(0, zBLOCK))r  indexing_size_strrr  r  r  r'  r!  r   )r   r  rj  rr  r  s        rz   r  z)TritonKernel.iteration_ranges_ranges_code  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr|   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )NrE   r>  r   r   )rr  r  )r   r  r  rr  r  rj  s         rz   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_code  sC     &&&&(sTz$r%;-q99r|   c                $   |j                   J d|j                    d}| j                  |      r#d| d|j                   dz    d|j                    d}|j                  j                  ||      }| j                  dk7  r| d	| j                   dS |S )
Nztl.program_id(r   r  z + tl.program_id(rE   z) * tl.num_programs(r  r  r  )r  r  	pid_cacherQ  rr  )r   r  r2  pids       rz   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pid  s    ~~)))u~~.a0 &&u- cU+ENNQ,>+??STYTbTbSccefCoo!!#s+z)U$t//022
r|   c                    |j                   dk(  xr[ |j                   xrL | j                   xr= t        j                  j
                  j                  |j                  t                      S ru  )	r  has_zdimr  rC   r  r   r  r  r,   )r   r  s     rz   r  z#TritonKernel.needs_yz_grid_overflow  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r|   c                    | j                   r | j                   |j                          d   S t        |j                            S )Nr   )r  r   r*   )r   r!  s     rz   r  zTritonKernel.max_block  s;    $$'7u%=>>//r|   c                   | j                   sy| j                  rW|j                  j                          d| j                  v r.| j                  |j                  j                          d   dk(  r6yt        j
                  j                  j                  |j                  d      ry|j                  r(| j                  r| j                  |j                        }n9|j                  dk(  r| j                  rd}n| j                  |j                        }|j                  r| j                  r|| j                         z  }t        j
                  j                  j!                  |j                  |      r[|j"                  dk7  xsJ |j$                  xs< t        j
                  j                  j'                  |j                  t)                     S y)NFr   rE   Tr  )r  r  r!  r   rC   r  r   r   r  r'  r  r  r   r  r  r  rG  r  r  r  r,   )r   r   r  s      rz   r  zTritonKernel._has_constant_mask  sa   !!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1Dww77

AF !:!:33DJJ?I[[CDMMIt{{3I!;!;!DOO$55I 7788YO" W==W77##88^EUV r|   c                d    | j                   d   }|j                  dk(  sJ | j                  |      S )Nr   r  )r,  r!  r  )r   xtrees     rz   r  z TritonKernel._has_constant_xmaskG  s5      #||s"""&&u--r|   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 |j                  d       y )Nr   r   )r,  r  r3  r!  )r   r   r   s      rz   r  zTritonKernel.filter_masksL  sN    $$ 	8D&&t,!!T[[M"67	8
 	&!r|   c                    t        t        j                        d | j                   D cg c]  }t        |    c}S c c}w rq   )r(  r   r   r#  r   )r   r   s     rz   get_reduction_prefixesz#TritonKernel.get_reduction_prefixesT  sB     ]::;<Ud>U>UV
 t
 	
 
s   <c                   | j                   D cg c]  }|j                  s| }}dj                  t        d |D                    }|j	                  d| j                  |              | j                   D cg c]+  }|j                  rt        j                  |j                     - }}t        |      }|j	                  d| j                  |              yc c}w c c}w )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        r  c              3  :   K   | ]  }|j                    d   yw)r  Nr{  r  s     rz   r   z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>a  s     "UTdkk]%#8"Ur}  z	rnumel = zRBLOCK: tl.constexpr = N)
r,  r'  r   r  r   r  r   r   r   r;   )r   r,  r   reduction_treesrP  	rn_blocksrN  s          rz   r  z%TritonKernel.codegen_reduction_numels[  s    
 -1,<,<RD@Q@Q4RRF"U_"UUV	$**V"4!567
 ((
   %%dii0
	 

 y)/

60B/CDE S

s   CC90Cc                |    | j                         }|D cg c]  }t        j                  | | fi | c}S c c}w )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        )r$  r   r   )r   r  r  rn_prefixesr!  s        rz   r  z#TritonKernel._get_reduction_symbolsm  s=     113JUVxx0;F;VVVs   !9c                    | j                         }| j                  ddd      }t        t        |      dz
        D cg c]  }t	        ||dz   d        c}t        j                  d      gz   S c c}w )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   rE   N)r$  r  r*  r  r;   r   r>  )r   r*  	rn_numelsrJ  s       rz   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffst  s{     113//PT/U	;@[AQTUAU;V
47M)C!GI./
]]1 	 
s   A0c                :    | j                         }t        ||      S )zK
        Compute linear reduction indices from N dimensional ones.
        )r-  r:   )r   
multi_indscoeffss      rz   r  z'TritonKernel._flatten_reduction_indices  s     113,,r|   c                $   | j                  ddd      }| j                  ddd      }| j                  |      }|j                  d| j                  |              | j                  |      }|j                  d| j                  |              y)zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r  r  r   r  )r   r,  
rn_offsetsrn_indsr9  rindexs         rz   r  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r|   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }|j                  | d| j                  |       d|j                          d|j                   d| g       | j                  |      r(| j                         }|j                  | d	| d
       y |j                  | d|j                   d| d       y )Nr  z	offset + r  z
offset = 0r   z	offset = r  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r!  r  r   rA  r  r  r  r  ru  r  r   r  r	  )r   r  r  r  r+  rm  s         rz   r  z,TritonKernel.iteration_ranges_codegen_header  sb    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5)'')ENNaSw6FGHNNaS

|3qc?@r|   )r   TN)r  zdict[str, sympy.Expr]r  zOptional[FixedTritonConfig]r~   r   r  r  r~   r   r   r   )r   r   )r   )rA  r   r  r   r  r   r~   ztuple[str, str])r  r   rj  r   r)  r}   r   r}   )rA  r   r   r   rq   )
rA  r   r   r   r  rK   rU  rB   r~   r   NN)r  rK   r]  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r^  rK   r_  r  r  r}   r`  z Optional[tuple[str, sympy.Expr]]ra  zOptional[CSEVariable]r~   rK   )r  r   r  r  r~   r   )
r  r  r/  r  r  rA   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r~   r8  )r  r  )rA  r   r   r   r  r8  )r  tuple[torch.dtype, ...]r~   r   )r  r9  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]r~   r:  )
r  r9  r  r:  rR  r}   rS  r}   r~   r:  )r~   rJ  )r~   z type[triton_heuristics.GridExpr])rA  r   r6  zOptional[IRNode]rZ  )r~   r  )r  rX   )r  rY   r~   r   )r  rY   r  r   r~   r   )r  rY   r~   r}   )r!  r   r~   r3  )r   rY   r~   r}   )r   r   r~   r   )r,  rM   r~   r   )r  r   r~   zlist[sympy.Symbol]rY  )r/  r  r~   r   )r  rY   r  rM   r~   r   )Mrr   r   r   r  r  r   r%  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r   r-  r(  r,  rX  rS  ri  rn  rt  r  r  r  r  r  r  r  r  r   r  rH  rU  ri  ro  r  r  r  r\  r  r  r  r  r  r  rk  r  r  r  r  r  r  r  r  r  r  r  r  r2   r$  r  r  r-  r  r  r  r  r	  s   @rz   r  r    s   %I%%).E&.O
 48%3%%3
 2%3 
%3N"

#J*:0

 " " fTfTR	 EG- - !- -<- 	- ^8MM M 	M
 M4EP SW(( *(3>(FO(	(T  480411 C1 &	1
 $1 1 11 .1 
1f.
"DD D &	D
 ;D 
5DL&DO&"9
v5(&!
%6(( ( ;	(T1Sfm"'m"
m" (m" 
!m"^>"'>" (>" 	>"
 >" 
!>"@b%H Xt

 % %N]~    $;LC
-8(,2&J:(:14:	:
0
(T.
" 
 
F$W 
 
-? A(A0>A	Ar|   r  c            
      `    e Zd ZU eZded<    eej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g      Zd fdZedd       Zd Zd ZdddZ	 d	 	 	 ddZ	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 	 	 dd
Zd Z xZS )TritonSchedulingz	type[Any]kernel_typec                    t         |   |       |t        |d      sy |j                  D ]$  }t	        |t
        t        f      st        |_        & y )Nr4  )	r  r  r   r4  rb  r1   r/   debug_triton_codedebug_device_str)r   	schedulerr6  r  s      rz   r  zTritonScheduling.__init__  sM    #GIw$?OO 	:D$0B CD(9%	:r|   c                    t         j                  j                  st         j                  j                  r't	        g | j
                  t        j                        S | j
                  S rq   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrH   REDUCE_TO_SINGLE_ELEMENT)rv   r|  s     rz   get_backend_featuresz%TritonScheduling.get_backend_features  sR     MM00}}99P#&&P(O(OP  ###r|   c                   t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rvddlm}m	 t        fd|D              sY|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              y y y c c}w )Nr   )r.   ForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wrq   )rb  )r   r  rI  s     rz   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>  s      >?
189s   z Fused node name list: r   )rC   r  r  r6   r   r   debug_fusiontorch._inductor.schedulerr.   rI  r   rb  get_namecommentr   )	r   node_scheduler  origins_detailed_originsr.   r  
node_namesrI  s	           @rz   codegen_commentz TritonScheduling.codegen_comment  s    ''&&%8%P""g&
  CP  +!!%67 JJL
 
 !!''>tyy?T>UV s   .#Cc                   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }||j                  |<   t        j
                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                         |      }|j                  dd      }t#        t%        |j'                               d      \  }	}
}t)               }t*        j-                         rt*        j                  ||       |j/                  d	|d
       |j1                  |d       t         j                  j3                         }|j/                  d|j4                   d       d| }t7        ||      \  }}|d|z   dz   |z   z  }|j9                  ||j;                         |       t=        j>                  d      rt=        j@                  |||       |S )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: rr  kernel_metadata)!rC   r  r  src_to_kernelr   r   descriptive_namesr5   rD   r   next_kernel_suffixunique_kernel_namesreplacer   r8   r  r  r#   r"   r  rM   async_compileuse_process_poolr   r   r}  r  r6   define_kernelr   r    is_metric_table_enabledlog_kernel_metadata)r   src_coderO  r  r  r  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr  metadata_commentrP  detailed_originss                    rz   r`  zTritonScheduling.define_kernel  s4   ''&&w,,,!//9Kj c ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%Iq+,.O--/ $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M7(S%G%w 58H HH!!_5579I ../@A++KhOr|   c                    | j                  |d      }t        j                  |      }| j                  ||t	        d |D                    S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r  c              3  <   K   | ]  }|j                           y wrq   rM  r   r  s     rz   r   z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>3  s     :WA1::<:Wr  )rR  )generate_kernel_code_from_nodesr$   r,  benchmark_codegened_moduler   )r   r4  n_spills_thresholdrc  rP  s        rz   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes+  sV    
 77PT7Ux(..#
:WQV:W0W / 
 	
r|   c                  	
 t        t        j                  j                        }t	               5  |j                  t        j                  j                               5  dfd

fd}
fd}||nt        dg      }t        j                  d|j                          |        j                  fcddd       cddd       S j                         	j                  j                  	   j                  	 d          j(                  }t+        |      d
k(  sJ |d   j,                  |kD  rt'        d	      nNt/        j0                  	fd      t+        j2                        dkD  rt/        j0                  	fd      z
  t        j                  d|        |        j                  fcddd       cddd       S # t        $ rn}t         j"                  j$                  r t        j                  d||       t'        d	       |        j                  fcY d}~cddd       cddd       S d}~ww xY w# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z$Benchmark an already compiled moduleNc                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextrP  s   rz   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_pathA  s6    ||///ww''5a8>IIr|   c                             } t        | d      5 }|j                  t                     d d d        y # 1 sw Y   y xY w)Nwopenwriter   )rz  fdr}  mss     rz   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cacheE  s;    &($_ &HHSW%& & &s	   9Ac                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wrq   )ry  rz  existsr  floatreadrz  r  r}  s     rz   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cacheJ  sM    &(77>>$'d 0r$RWWY/0 00s   AA unknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrE   c                 4      j                     d         S r=  
clone_argsrC  callwrapped_jit_functions   rz   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>z       D!@!5!@!@$!G!JK r|   c                 "     j                     S rq   r  rC  r  s   rz   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  s     ? 4 ? ? F r|   z+The fused kernel for %s took %.3f ms to run)r   rC   r  r  r   r|  r}  r   r  debugrx  get_argsr  rU  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr  n_spillsr'   benchmark_gpur  )r   rP  rr  rR  device_interfacer  r  r  r  rC  r}  r  r  r  s    `       @@@@@rz   rq  z+TritonScheduling.benchmark_codegened_module6  sK    4AGG4G4GH P	$##AGG$G$G$IJP	$ BJ&
 )4
*i[:Q  II7
 B~3<<'AP	$ P	$ P	$D <<>D88D#&;; (4)44d;A>? -66Iy>Q&&& |$$'995\ !..K +==>Bk77F B II=
 Ms||#aP	$ P	$ P	$P  
(==OO		@
 5\3<<''eP	$ P	$ P	$P
(QP	$ P	$ P	$ P	$ P	$sh   .I%"AI<	I%(I8GB3I	I%	IAI0I1I5	I%III	I%%I.c                   |j                  d      }|xr  t        d |j                         D              }| j                  }|rddlm} |}|rd|d<   |j                  d      r
d|d	<   d|d<   t        j                  |j                        s|j                  d	      rJ d|d	<   t        j                  j                  ||||      } ||i |}| j                  |||      S )
NrH  c              3  <   K   | ]  }|j                           y wrq   )is_split_scan)r   r6  s     rz   r   z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
%)D (
r  rE   )TritonSplitScanKernelFoverride_cooperative_reductionrU  Toverride_persistent_reduction)contains_opr   scheduler_nodesr=  triton_split_scanr  r  r  reduction_numelrQ  rC   r  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr  r=  r  r  s	            rz   create_kernel_choicesz&TritonScheduling.create_kernel_choices  s     "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v.=AM9:>CM:;11/2Q2QR$(()HIII=BM9:		66+}
 k;];,,V[-PPr|   c           	        |g}t         j                  j                  s|S |j                  xr |j	                  d       }|j
                  xr |j	                  d       }|r%|j                   | j                  |i |ddi       |r|j                  j                  }t        j                  j                  j                  |d      r[|j                   | j                  |i |ddix}       |r2|j                  r&|j                   | j                  |i |ddd       t        |      dkD  r.|dd  D ]  }	|j                  |	_         |j!                  d        |S )	Nr  r  Fi   )r  r  rE   c                    | j                   S rq   )r  )ks    rz   r  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>  s    q'='= r|   )r2  )r   r   multi_kernelr  rQ  r  rc  r=  r&  r  rC   r  r   r  r  must_keep_buffersrU  )
r   r  r  r  kernelsoptional_persistentoptional_cooperativerP  r  kernel2s
             rz   r  z)TritonScheduling.add_multi_kernel_choices  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN    # 38  __44Fww44VUC-T--$' 8= E '5+E+ENN((((+ <A:?	 w<!"12; E,2,D,D)E LL=L>r|   c                   fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }t        |      t         j                  _        t
        j                  dkD  }	t
        j                  dkD  }
| j                  |d|	|
d      }|D ]  \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}|j                  t        t        j                        d      }t        j                   |      t"        j%                  d	|j&                          |       \  &|z  }|z  }|j)                  j&                         ܉j+                         j,                  j.                    j0                   d          j2                  }t5        |      d
k(  sJ |d   j6                  dkD  rt9        d      xn3t;        j<                  fd      t;        j<                  fd      t"        j%                  dt        d |D                      |        |z  }|z  }|j)                  j&                          |t         j                  _        |t         j                  _        |||fS c c}w c c}}w )Nc                 ~     j                   J t        j                  j                   j                         d   dz   S rv  rw  r|  s   rz   r}  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path  s6    <<+++77##CLL1!4~EEr|   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wrq   )r  )r   r  s     rz   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>  s      Eaq Er  r7  )ry  rz  r  r  r  r  splitr  s     rz   r  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                             } t        | d      5 }|j                  t              dz   t              z          d d d        y # 1 sw Y   y xY w)Nr  r  r  )rz  r  r}  r  ms_clones     rz   r  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cache  sH    "$DdC 8BR3X678 8 8s   *AAr   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coderU  r  rE   r  c                 4      j                     d         S r=  r  r  s   rz   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>/  r  r|   c                 (     j                     d   S r=  r  r  s   rz   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>2  s    ;0;;TB1E r|   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  <   K   | ]  }|j                           y wrq   rn  ro  s     rz   r   z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>7  s     <A1::<<r  )rC   r  r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesrM  r]  r   r8   r  r$   r,  r  r  rx  rc  r  r  rU  r  r  r  r  r  r'   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listrc  r  
node_groupr6  fused_node_listsr4  r  namesr  rC  r}  r  rP  r  r  r  s                        @@@@@@@rz   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel  s   
	F	 	8
  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8 2	+#Ha=GHT 0HH/?OeOAQZZ\O\OEO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV<<<	 MNHh&NS\\*e2	+f #7%<"22i  IOs   K7K#)rA  zOptional[Scheduler]r~   r   )r|  ztorch.device)   )r~   tuple[float, str])r  N)rR  zOptional[OrderedSet[str]]r~   r  )r  rg   r  	list[Any]r  r  r~   list[TritonKernel])r  r  r  r  r  r  r~   r  )rr   r   r   r  r=  r   r   rH   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONrE  r  r   rG  rS  r`  rs  rq  r  r  r  r  r	  s   @rz   r<  r<    s   )K)!""$$**44++**		
: $ $48t	
 RVU$5NU$	U$n#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jZ3r|   r<  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} | j                         }|J | j                  j                  |      }t        |t        |f      sJ dt        |              t        j                  j!                  |      5  t"        j$                  }|j'                  | j)                               j+                         }|t"        _        d d d        |j                  | j                          d       |j                  t-        j.                  d             |S # 1 sw Y   RxY w)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noderb  r   MultiTemplateBuffermake_kernel_renderrc  rM  0torch._inductor.codegen.cuda_combined_schedulingr  rz  rA  get_backendr[   r  rC   r  set_current_devicer    generated_kernel_countrp  r  r  r  rZ  )r6  linesmulti_templater  r|  backendold_generated_kernel_counttriton_codes           rz   r?  r?  D  sd   E++-N!Z@V@V%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KL 	
klpqxlykz{	
L WW''/ 	H *1)G)G&!AA eg  .HG*	H 	(67X__[&9:L	H 	Hs   A	E66E?r   )r  r   rd  r   re  r   r~   r   r6  )r  r  r~   r  )r  r  r~   r}   )r  zUnion[CSEVariable, Any]r~   r}   )r~   re   r[  )rx   r}   r~   zCallable[[_T], _T])r6  r.   r~   rJ  )
__future__r   rf  rP  r  r  r  loggingr  ry  r  collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   rs   torch._loggingtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r   r    r^  r!   	codecacher"   r#   r$   ops_handlerr%   runtimer&   runtime.benchmarkingr'   runtime.hintsr(   r)   r*   r+   runtime.runtime_utilsr,   r-   rA  r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   virtualizedr@   rk  rA   rB   rC   wrapper_benchmarkrD   block_analysisrF   commonrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   simdrV   rW   rX   rY   rZ   r[   triton_utilsr\   r]   r^   r_   r`   r  ra   typesrb   rc   r  re   rf   simd_kernel_featuresrg   rh   	getLoggerrr   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrm   r   r   r   	dataclassr   r   r   rl  r  r%  r  r  r  r  r  r  r  r  r(  r*  _initialize_pointwise_overridesr  rH  r   rb  r{  r   r  r  r  r<  r?  r   r|   rz   <module>r     s5   "        	  .  F F  0   $ $ C < 0 / K K 2 X X 4 " " ( 8 8 ( ' .  D W W     C B B /    "   %  L8	Bg!00<H~~//*E^^--hA
6 6  4 $ 4 *, ,: R R R6 N+ N+ N+b++/+<P++>aQM aQH 	3
&8
;P *(.bi&k i&X  / / 9D$O D$N$+ $+N : : :&! !H # # #
%uS%S/-A'BBC 
U&A:/0 U&ApLL3~ L3^r|   