
    VhY                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlZd dlZd dl Zd dl!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z'm(Z(m)Z) d d
l*m+Z+ d dl,m-Z- d dl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZImJZJ ddlKmLZLmMZMmNZN ddlOmPZP ddlQmRZR ddl4mSZSmTZT ddlUmVZV ddlWmXZX ddlYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZi ddljmkZk  ej                  em      Zni Zoepeqef   erd <   d!Zsd"Zter
d dluZud d#lvmwZw  G d$ d%      Zx ex       Zyej                   G d& d'             Z{ej                   G d( d)             Z| G d* d+      Z} ej                          G d, d-             Z~ G d. d/ekj                        Z G d0 d1eI      Z ej                  d      d2        Z G d3 d4eA      Z G d5 d6      Z G d7 d8e4j                        Z G d9 d:eS      Z ej                  d      d;eeq   fd<       Zd= Z G d> d?      Z G d@ dA      Z G dB dCe      Z G dD dEe      Z ej                  d      d;efdF       Zd;eqfdGZdHeqdIeqdJeeS   d;eqfdKZ G dL dMe<      Zdaee   erdN<   dO ZdPeepeSef   eqee   eeS   gdf   fdQZdR Z G dS dT      ZddUl2mZ y)V    N)as_completedThreadPoolExecutor)StringIO)AnyCallableOptionalTYPE_CHECKINGUnion)Self)patch)get_interface_for_device)rand_strided)countersdynamo_timedidentitypreserve_rng_state)clear_on_fresh_inductor_cache)FileLock)
OrderedSet   )CeilDiv   )configir)
TensorMetaTritonBenchmarkRequestTritonCPUBenchmarkRequestTritonGPUBenchmarkRequest)	code_hashPersistentCachePyCodeCache)CSEVariableIndentedBufferKernelTemplateOpOverridesWorkspaceArg)SIMDKernelFeatures)gen_common_triton_importstexprTritonKernelTritonScheduling)	config_ofequal_1_arg_indicessignature_to_meta)pexpr)CUDACompileError)ChoiceCallerPrimitiveInfoType)	StoreMode)benchmarker)DeviceProperties)	FixedGrid)ceildivFakeIndentedBufferget_dtype_sizeis_gpuPlaceholderrestore_stdout_stderr	sympy_dotsympy_index_symbolsympy_producttriton_typetriton_type_to_torchunique)VVERIFYTF)IterationRangesRootc                       e Zd Zy)KernelNamespaceN__name__
__module____qualname__     P/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/select_algorithm.pyrG   rG   _       rM   rG   c                   `    e Zd ZU dZeej                     ed<   eej                     ed<   d Z	y)BenchmarkTensorszERepresents a set of inputs and outputs for autotuning with a templateinput_tensorsoutput_tensorc                 2    | j                   | j                  fS N)rR   rS   selfs    rN   unpackzBenchmarkTensors.unpackn   s    !!4#5#555rM   N)
rI   rJ   rK   __doc__listtorchTensor__annotations__r   rX   rL   rM   rN   rQ   rQ   g   s(    O%%ELL))6rM   rQ   c                      e Zd ZU dZeed<   eed<   dZeej                     ed<   ddefdZ
e	 ddeej                     d	eej                     d
ej                  dej                  deej                     defd       Zd Zy)AutotuneArgsa  During autotuning, we need to pass the same inputs to all choices.
    Note:
        Since we typically have a mix of external choices and triton choices, we create
        two lists of inputs for the same underlying buffers:
        - External inputs (for aten kernels): Include offset for sliced tensors
        - Triton inputs: Use base pointer for sliced tensors, without offset
    tritonexternNexpectedreturnc                 >    |r| j                   }|S | j                  }|S )z9Returns the inputs and output tensors for a given choice.)ra   r`   )rW   ra   bench_tensorss      rN   get_benchmark_tensorsz"AutotuneArgs.get_benchmark_tensors   s$    '- 48;;rM   example_inputsexample_inputs_externout
out_externc                 @     | t        ||      t        ||      |      S )zDFactory method to create AutotuneInputs from separate inputs/outputs)r`   ra   rb   )rQ   )clsrg   rh   ri   rj   rb   s         rN   from_choice_argszAutotuneArgs.from_choice_args   s)     #NC8#$9:F
 	
rM   c                     t        j                  j                  | j                  j                  | j
                  fi | y)z2Verify the correctness of the benchmarking resultsN)r[   testingassert_closera   rS   rb   )rW   kwargss     rN   verifyzAutotuneArgs.verify   s+     	""4;;#<#<dmmVvVrM   FrU   )rI   rJ   rK   rY   rQ   r]   rb   r   r[   r\   rf   classmethodrZ   r   rm   rr   rL   rM   rN   r_   r_   r   s     '+Hhu||$+5E 
  ,0
U\\*
  $ELL1
 \\	

 LL
 5<<(
 

 
WrM   r_   c                   B     e Zd ZdZd fdZd	deddfdZdefdZ xZS )
PartialRenderz
    Some parts of a template need to be generated at the end, but
    inserted into the template at the start.  This allows doing a bunch
    of replacements after the initial render.
    rc   Nc                 >    t         |           || _        || _        y rU   )super__init__codereplacement_hooks)rW   rz   r{   	__class__s      rN   ry   zPartialRender.__init__   s    	!2rM   hook_keyc                     || j                   vr|rt        | d      y | j                   |   J d       | j                  j                  | | j                   |                | _        d | j                   |<   y )Nz) not registered in self.replacement_hooksz hook_key can only be called once)r{   RuntimeErrorrz   replace)rW   r}   stricts      rN   finalize_hookzPartialRender.finalize_hook   s    4111"j IJ  %%h/; 	
.	
; II%%h0P0F0Fx0P0RS	+/x(rM   c                     | j                   j                         D ]+  \  }}| j                  j                  | |             | _        - | j                  S rU   )r{   itemsrz   r   )rW   keyfns      rN   finalize_allzPartialRender.finalize_all   sH    --335 	5GC		))#rt4DI	5yyrM   rc   NT)	rI   rJ   rK   rY   ry   strr   r   __classcell__r|   s   @rN   rv   rv      s,    3
0c 04 0c rM   rv   c                   R   e Zd ZU eed<   dZee   ed<   dZee   ed<    e	j                  e      Zeed<    e	j                  e      Zeed<    e	j                  e      Zeed<    e	j                  e      Zeed	<   dZeej"                     ed
<   dZeed      ed<   dZd Zd Zy)SubgraphInfobodyNtemplate_masktemplate_out)default_factorycomputeindexing_codeloadsstoresops_handlerrE   range_treesc                     d| _         y )N)r   numels)only_copy_if_non_none_fieldsrV   s    rN   __post_init__zSubgraphInfo.__post_init__   s
    ,E)rM   c                     t        j                  |       D ci c]#  }|j                  t        | |j                        % c}S c c}w rU   )dataclassesfieldsnamegetattr)rW   fields     rN   to_dictzSubgraphInfo.to_dict   s?    ?J?Q?QRV?W
6;EJJejj11
 	
 
s   (A)rI   rJ   rK   r#   r]   r   r   r   r   r   r   r   r   r   r   r   rC   WrapperHandlerr   rZ   r   r   r   rL   rM   rN   r   r      s    
#'M8C='"&L(3-&/k//OG^O$5K$5$5n$UM>U-K--nME>M.[..~NFNN.2K!**+2 :>K$456=FF
rM   r   c                        e Zd ZdZdedeeef   dee   f fdZ	dede
j                  fdZdd	efd
Z	 ddede
j                  dededef
dZdefdZd Z xZS )ModificationWrapperz=Handles placeholder substitutions during subgraph processing.subgraph_numberfixed_inputsmaskc                 ~    t         |   t        j                         d| | _        || _        || _        || _        y )NPlaceholderSubstitution_)rx   ry   rC   opsr   kernelr   r   )rW   r   r   r   r   r|   s        rN   ry   zModificationWrapper.__init__   s;     	..?@	(	rM   r   indexc                    || j                   vr| j                  |      }| j                  |      }t        j                  j                  |      j                  }d| d| d}|t        j                  t        j                  fv r/t        j                  j                  r|dz  }t        j                  }| j                  j                  j!                  | j                  j"                  ||      }|S | j                  j                  j!                  | j                  j"                  d| j                   |    dt        j                        S )z*Handle loading from tensor or fixed input.tl.load( + )z.to(tl.float32)dtype()r   _process_indexing_add_kernel_inputrC   graph
get_bufferr   r[   float16bfloat16r   r`   codegen_upcast_to_fp32float32r   csegenerater   )rW   r   r   	index_strvar	var_dtypelineri   s           rN   loadzModificationWrapper.load   s   t(((..u5I((.C**4066IcU#i[2D emmU^^<<MM88))!MM	++//**4;;+>+>I*VCJ{{''KK1T%6%6t%<$=Q!?u}} ( 
 	
rM   	index_varc                 *    t        t        |            S )z(Convert index variable to symbolic form.)r>   r   )rW   r   sizecheckwrap_negs        rN   indirect_indexingz%ModificationWrapper.indirect_indexing   s    !#i.11rM   valuemoderc   c           	          | j                   J d       |dk(  sJ d       | j                  |      }| j                  |      }d| d| d}d| d| d| d| j                    d		}|S )
zCurrently only supports stores for atomic adds coming from scatter nodes
        This is used by flex_attention's backwards grad for captured buffers, see
        zeros_and_scatter lowering
        z2Mask is required for inner stores in modifications
atomic_addz-Only atomic_add is supported for inner storesztl.broadcast_to(, z.shape)ztl.atomic_add(r   z, sem='relaxed'))r   r   r   )rW   r   r   r   r   buf_namer   stores           rN   r   zModificationWrapper.store  s     yy$ 	
@	
$ |#T%TT#))$/**51	&ykE7'B	 
#i[5'DII;N^_rM   c                 L    | j                   j                  j                  |      S )z1Add name as input to kernel and return input ref.)r   argsinputrW   r   s     rN   r   z%ModificationWrapper._add_kernel_input  s    {{%%d++rM   c                 j    | j                   j                  | j                   j                  |            S )z=Process and rename indexing, adding symbols as kernel inputs.)r   kexprrename_indexing)rW   r   s     rN   r   z%ModificationWrapper._process_indexing  s&    {{  !<!<U!CDDrM   r   rU   )rI   rJ   rK   rY   intdictr   r   r   ry   sympyExprr   r   r"   r3   r   r   r   r   r   s   @rN   r   r      s    G  38n	
 sm
 
UZZ 
,23 2
 SW %

3>FO	$,c ,ErM   r   c                   >    e Zd Zdddeddfdeeej                        dee   ddf fdZ	e
j                  defd	       Ze
j                  defd
       Zd Zd Zd Zd Zd Zd ZdedefdZd+dZdefdZd Z	 d+dedee   dee   defdZ	 	 	 d,dededeee   ee   f   dee   deeeef      defdZ	 	 d-deee   ee   f   d edee   defd!Z d" Z!d# Z"d$ Z#ddddd%de$jJ                  f fd&Z&d' Z'd+ded(eejP                     fd)Z)dee   fd*Z* xZ+S ).TritonTemplateKernelFr   N	subgraphsworkspace_argrc   c                    t        |j                               }t        |   |t        j
                  j                  dt        g |             || _        || _	        i | _
        || _        || _        |
| _        || _        || _        || _        || _        |	| _        || _        || _        || _        i | _        d | _        || _        || _        |%| j4                  j6                  j9                  |       i | _        t=               | _        t=               | _         t=               | _!        tE               | _#        tE               | _$        tE               | _%        tE               | _&        tE               | _'        d | _(        d | _)        d | _*        y )Nxr0_)features)+r?   get_sizerx   ry   r   SOner'   input_nodesoutput_nodenamed_input_nodesdefineskernel_nameuse_jit
num_stages	num_warpsgrid_fnmeta
call_sizesprefix_argssuffix_argsepilogue_fnrender_hookstriton_metar   r   r   workspace_argsappendsubgraph_bodiesr   prologue_supported_inputsprologue_fused_inputs#prologue_fused_inputs_preserve_zeror8   r   r   r   r   r   r   r   r   )rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   numelr|   s                    rN   ry   zTritonTemplateKernel.__init__  sn   $ k2245ww{{ (E2 	 	
 '&!#&$"	$&&&8<<E +$II$$++M: 9; ;E,& 7Al"DNL0 %7$8	'9';-?-A%7%9
&8&:,0+/7;rM   	body_namec              #   Z   K   t         fdt        j                  t              D              sJ t        j                  t              D ci c]#  }|j                  t         |j                        % }}| j                  v sJ |        j                  |   }|j                         j                         D ]#  \  }}|||j                  v rt         ||       %  j                  st        j                  n fd} |       5  d  d d d        t        di t        j                  t              D ci c]#  }|j                  t         |j                        % c} j                  |<   |j                         D ]  \  }}t         ||        y c c}w # 1 sw Y   xY wc c}w w)Nc              3   J   K   | ]  }t        |j                          y wrU   )hasattrr   ).0r   rW   s     rN   	<genexpr>z9TritonTemplateKernel.set_subgraph_body.<locals>.<genexpr>m  s!      
*/GD%**%
s    #c                  p    t        j                   j                  t        j                                     S rU   )rC   set_ops_handlerr   get_ops_handlerrV   s   rN   <lambda>z8TritonTemplateKernel.set_subgraph_body.<locals>.<lambda>  s&    **4+;+;A<M<M<O+PQ rM   rL   )allr   r   r   r   r   r   r   r   r   setattrr   
contextlibnullcontext)rW   r   r   	old_statesubgraphr   contexts   `      rN   set_subgraph_bodyz&TritonTemplateKernel.set_subgraph_bodyk  s     
3>3E3El3S
 
 	
 

 #)),7
 HHgdCHH--
	 
 D000;);0''	2"**,224 	&JC}(M(M!MD#u%	& ## ""Q 	
 Y 		*6 +
 '--l; '$11+
Y' $//+ 	&JCD#u%	&3
"	 	s7   A	F+(F4BF+F*F+5(F&=F+F#F+c              #      K   || j                   vsJ t        t               d d       | j                   |<   | j                  |      5  d  d d d        y # 1 sw Y   y xY wwrU   )r   r   r#   r  )rW   r   s     rN   create_subgraph_bodyz)TritonTemplateKernel.create_subgraph_body  sb      4 4444*6+
Y'
 ##I. 		 	 	s   AA A	A AA c                      y)NFrL   rV   s    rN   need_numel_argsz$TritonTemplateKernel.need_numel_args  s    rM   c           	      6   t        t        | j                  j                  j	                                     }g }t        t        j                  | j                  | j                  f            D ]  \  }}t        j                  j                  j                  |j                               }t        j                   t"        j$                  |d      }t'        |j)                               }|j+                  ||z  dt-        ||k        z   z          t/        |      S )z
        Estimate the total number of bytes this kernel takes.
        For in/out nodes, sizes are counted twice: once for reading and
        once for writing.
        r   )lenrB   r   inplace_buffersvalues	enumerate	itertoolschainr   r   rC   r   sizevars
size_hintsr   	functoolsreduceoperatormulr9   	get_dtyper   r   sum)rW   ninplace_args	num_bytesiinpr   r   
dtype_sizes           rN   estimate_kernel_num_bytesz.TritonTemplateKernel.estimate_kernel_num_bytes  s     F499#<#<#C#C#EFG		0@0@4CSCSBU VW 	PFAs77##..s||~>D$$X\\4;E'8JUZ/1s1};L7M3MNO		P
 9~rM   c           	      l   | j                   ry| j                  j                         \  }}}}t        || j                  |      t        j                  | j                  j                               i d}t        |      g|d<   t        |      D ]  }d|d   ||   j                  <    | j                  j                  dd       }| j                  j                  dd       }| j                  j                  d	d       }|r||d<   |r||d<   |r||d	<   || _        d
t        t         j"                        it%        j&                         t)        j*                         }	t,        j.                  st,        j0                  r| j3                         dz  }
|
|	d<   d| j4                   d| j6                   d|d|	d	S )Nz@triton.jit)
size_dtypeargdefs)	signaturedevice	constantsconfigsr   r0  matrix_instr_nonkdimwaves_per_eukpackr   g    eAkernel_num_gbzE
            @triton_heuristics.template(
                num_stages=z,
                num_warps=z,
                triton_meta=z ,
                inductor_meta=z0,
            )
            @triton.jit
        )r   r   python_argdefsr.   index_dtyper5   creater   
get_devicer,   r-   r   r   getr   r   r;   DESCRIPTIVE_NAMEr*   inductor_meta_commonr6   setup_grid_as_argsr   profile_bandwidthbenchmark_kernelr*  r   r   )rW   r-  _r.  r   arg_numr2  r3  r4  inductor_metanum_gbs              rN   	jit_lineszTritonTemplateKernel.jit_lines  s   << #'99#;#;#= Iq*d&6&6 '--d.>.>.I.I.KL'
 #,I"6!7I*95 	BG@AK$Yw%7%<%<=	B#yy}}-CTJyy}}^T:		gt,2FK./*6K'#(K & 3{;;<
//1
 **,

 ##v'>'>335;F-3M/* OO, ->>* +(O ,,/ 0	 	rM   c                 .      fd}| j                   d<   y)Nc                  n    j                   j                         ^} }dj                  d | D               S )Nr   c              3   <   K   | ]  }|j                           y wrU   	full_namer  r   s     rN   r  zATritonTemplateKernel.gen_argdefs.<locals>.hook.<locals>.<genexpr>  s     @!@   )r   r6  join)arg_defsr@  rW   s     rN   hookz.TritonTemplateKernel.gen_argdefs.<locals>.hook  s1    99335LHqii@x@@ABrM   z	<ARGDEFS>)r   )rW   rN  s   ` rN   gen_argdefsz TritonTemplateKernel.gen_argdefs  s    	C
 *.+&rM   c                     | j                   S rU   )r   rV   s    rN   gen_definesz TritonTemplateKernel.gen_defines  s    ||rM   c                     t        d |D              sJ t        d       j                   j                  t	         j                         j
                  z
   }t	        |      t	        |      k(  s:J t	        |      t	        |       j                  t	         j                        f        j                  d j                   D ]+  } j                  j                  |j                                - t        ||      D ]  \  }}d| }| j                  |<   |j                         t        j                  j                  v rE|j                          j                  v rb| j                  j                  |j                         <    |D ]   } j                  |   }|j                         t        j                  j                  v r>|j                          j                  v r[ j                  j                  |j                            }|j!                         j"                  dk(  rj%                  | d|        t'         j)                  |j!                         j"                              }j%                  | d| d|          j                  t	         j                         j
                  z
  d D ]s  }|j                         t        j                  j                  v r.|j                          j                  v rK j                  j                  |j                                u  fd	}d
 j*                  vsJ | j*                  d
<   y
)zb
        Hook called from template code to generate function def and
        needed args.
        c              3   <   K   | ]  }t        |t                y wrU   )
isinstancer   rJ  s     rN   r  z2TritonTemplateKernel.def_kernel.<locals>.<genexpr>  s     8!:a%8rK  r   )initial_indentNarg_r    = r   c            	      ,   j                   j                         ^} }t               }|j                  t	                      |j                  j                                |j                  dj                   ddj                  d | D               d       |j                         5  |j                  j                         |j                  j                                d d d        |j                         S # 1 sw Y   |j                         S xY w)Nzdef r   r   c              3   <   K   | ]  }|j                           y wrU   rH  rJ  s     rN   r  z@TritonTemplateKernel.def_kernel.<locals>.hook.<locals>.<genexpr>!  s     3TaAKKM3TrK  z):)r   r6  r#   splicer(   rD  	writeliner   rL  indentr   getvalue)rM  r@  rz   renamesrW   s      rN   rN  z-TritonTemplateKernel.def_kernel.<locals>.hook  s    99335LHq!#DKK134KK()NNt''($))3T83T*T)UUWX  0DLL)G,,./0 ==?"0 ==?"s   (;C;;Dz<DEF_KERNEL>)r
  r#   r   r   r  r   r   r   get_namezipr   rC   r   removed_buffersr   input_buffers
get_layoutoffsetr[  r)   r   r   )	rW   argnames
named_args
input_noder   arg_namerd  rN  r^  s	   `       @rN   
def_kernelzTritonTemplateKernel.def_kernel  s	   
 8x8888 2%%s4#3#34t7G7GG

 8}J/ 	
M
O  !	2
 	
/ **+=T-=-=> 	3JIIOOJ//12	3 !$Hj 9 	FD*dV}H+5D""4(""$(?(??""$(B(BB=EDII##J$7$7$9:	F  	ED//5J""$(?(??""$(B(BByy..z/B/B/DEH$$&--2!!TF#hZ"89t33J4I4I4K4R4RST!!TF#hZs6("CD	E **3t/?/?+@4CSCS+S+UV 	3J""$(?(??""$(B(BBIIOOJ//12	3	# T%6%6666,0.)rM   r   r   c                     t        |t              sJ || j                  j                         |   }n2t        |t              sJ | j
                  |   j                         |   }t        | j                  |            S )z
        Hook called from template code to get the size of an arg.
        Will add needed args to pass it in if it is dynamic.
        )rT  r   r   r   r   r   r)   r   )rW   r   r   vals       rN   r   zTritonTemplateKernel.size,  sv    
 %%%%<""++-e4CdC(((((.779%@CT))#.//rM   c           
      l   || j                   j                         }n/t        |t              sJ | j                  |   j                         }t        |t
              rt        | j                  ||               S dj                  |D cg c]  }t        | j                  |             c}      S c c}w )z
        Hook called from template code to get the stride of an arg.
        Will add needed args to pass it in if it is dynamic.
        r   )	r   
get_striderT  r   r   r   r)   r   rL  )rW   r   r   rk  r'  s        rN   stridezTritonTemplateKernel.stride9  s    
 <""--/CdC(((((.99;CeS!--c%j9::yy#FQ% 4 4Q 78FGGFs   	!B1r   c                 4   t        |t              sJ t        | j                  t              sJ |t	        | j                        k  s J d| dt	        | j                                | j
                  j                         dk(  sJ d       | j                  |   S )Nz9Invalid subgraph number provided to create_modification, z must be <  z1Body should be clear before adding a modification)rT  r   r   rZ   r  r   r]  )rW   r   s     rN   _get_subgraphz"TritonTemplateKernel._get_subgraphH  s    /3///$..$///T^^!44 	
GGXXcdghlhvhvdwcxy	
4 yy!!#r) 	
?	
) ~~o..rM   c                     t        t        j                        sJ dt                      fd}j                  j                  j                  |g       S )z}Handle processing for a single scatter graph.

        Args:
            scatter_graph: The scatter graph to process
        z;scatter_graph must be an instance of ComputeBuffer but got c                 X    t        d t        | j                               D              S )Nc              3   ,   K   | ]  \  }}||z    y wrU   rL   )r  x_irn  s      rN   r  zYTritonTemplateKernel._handle_scatter_graph.<locals>.contiguous_strides.<locals>.<genexpr>_  s      !,ffs   )r$  r`  rm  )r   scatter_graphs    rN   contiguous_strideszFTritonTemplateKernel._handle_scatter_graph.<locals>.contiguous_strides]  s-     03A}7O7O7Q0R  rM   )rT  r   ComputedBuffertypedatastore_outputr   )rW   rv  rw  s    ` rN   _handle_scatter_graphz*TritonTemplateKernel._handle_scatter_graphS  s`     -):):; 	
I$}J]I^_	
;	 !!.. 2B
 	
rM   output_namer   c                 n   d}d}g }d| d| | j                   v r|dz  }d| d| | j                   v r| j                  d| d|       5  | j                  |      }t        | |||      }	t	        j
                  |	      5  t        |t        j                  t        f      sJ dt        |              t        |t              r(|D ]"  }
|j                  | j                  |
             $ n`t        |j                  t        j                        r! |j                  j                         d      }n|j                  j!                  d      }ddd       | j#                          |At        |t$              sJ |J | j&                  j)                  | d|j*                          n/|J |D ]&  }| j&                  j)                  t%        |             ( | j&                  j-                         }| j.                  j1                  t3                      |cddd       S # 1 sw Y   xY w# 1 sw Y   yxY w)	a  This creates a modification function for a subgraph.
        To use this inside a template, the first argument should specify which subgraph to codegen for

        Args:
            subgraph_number (int): The index of the subgraph in self.subgraphs
            output_name (Optional[str]): The name of the output variable to store the result in
            mask (Optional[str]): An optional mask to use for the store operation. If provided, this mask
                will be applied to the store.
        r   Nmod_r@  r   zLExpected the subgraph to be a ComputedBuffer or a List[ComputedBuffer], got rL   rW  )r   r  rq  r   rC   r  rT  r   rx  rZ   ry  r   r|  rz  InputBuffermake_loaderinner_fncodegen_bodyr   r   r[  r   r]  r   
invalidater   )rW   r   r}  r   r   numri   scattersr  modification_handlerrv  scatterbody_vals                rN   modificationz!TritonTemplateKernel.modificationg  s&     _%Qse,0D0DD1HC _%Qse,0D0DD&&o->au'EF 	))/:H#6o|T$  ""#78 5!(R->->,EF bcghpcqbrsF h-)1 S (B(B=(QRSr~~>5(--335b9C"--004C5 &!+s333&		##{m3syyk$BC{"{' 6GII''G56 yy))+HHH
-=	 	
5 5	 	s&   5H+CHCH+H(	$H++H4
input_nameindicesotherindent_widthc           	      (
     j                   |    j                  j                  j                                t	        j                               t        j                  d      f}|d   |d   d} j                  ddd|d      }	d j                  d| d      5  t        |t        t        f      sJ t        t              sJ t        |t        t        d      f      sJ |	 _        |j!                         D 
ci c]/  \  }
}|
t"        j$                  j&                  j)                  |      1 c}}
 _        t        t-        t.        j0                  |            }|D cg c]  }t        j2                  |d	
       }}j                         D cg c]+  }t"        j$                  j&                  j)                  |      - }}t5        |      t5        |      k(  sJ |D cg c]  }t        j2                  |d	
       }}t5        |      t5        |      k(  sJ t7        | j                  d   j9                  |            D ]  \  }}|j;                  |        t=        t>        j@                  jC                  |      |      } jE                  |      } jF                  jI                  dtK        |      z           j                  d   jM                  t        j                  d      t	        |            }|j;                  d       ||nd _'        d _(        | _)         j                   |   jT                  jW                           jX                  j[                  t]                       jN                   G fddt"        j^                        }| _0         j                   |    jc                         |      }t"        j$                  j&                  je                  jf                  jh                  d      s,| jE                  jk                         jh                        z
  } jE                  |      }||k(  rd}nL jm                  | jP                   jN                        }ddl7m8} t        ||      sJ d|jr                   d} d| d| d|rd| d dz  ndz  ddd       d| d fd} jt                  vsJ | jt                  <   S c c}}
w c c}w c c}w c c}w # 1 sw Y   NxY w)a=  Loads an input and applies any necessary preprocessing or masking.

        Args:
            input_name (str): The name of the input to load.
            indices (Union[List, Tuple]): The index for each dimension of the input.
            val (str): The name of the variable to store the loaded value.
            mask (Optional[str]): An optional mask to use for the load operation.
            other (Optional[Union[float, int]]): The value to use for masked elements. Default is 0.0.
            indent_width (int): The number of spaces to use for indentation.
        r   r   r   NF)	pid_cacheinside_reductionis_reductionr   no_x_dimz<LOAD_INPUT_>Tinteger	xindex = xindexNonec            	       J    e Zd ZdZ	 d
dedej                  ddddf fd	Zy)@TritonTemplateKernel.load_input.<locals>.StoreOutputSubstitutionStoreOutputSubstitutionNr   r   r   r"   r   r3   c                    t         j                  j                  j                  |       |t         j                  j                  j
                  |<   |t         j                  j                  v r|j                  }t        |      }	dk7  r-|t         j                  j                  vsdk7  rd	 d| d d}|t         j                  j                  |      j                  k7  r8| dt        t         j                  j                  |      j                         d}t         j                  j                  j                   d| d       y y )	Nr  r   z	tl.where(r   r   z.to(rW  z.broadcast_to(xindex.shape))rC   r   store_buffer_namesaddr   store_cacher   r   r   r   r   r   r@   r   r[  )
rW   r   r   r   r   value_dtype	value_strr  r}  r   s
          rN   r   zFTritonTemplateKernel.load_input.<locals>.StoreOutputSubstitution.store  s    HH//33D95:AHHLL,,T2qxx=== ',kk$'J	(F2 (T(TT$z #,M?"YKr%PQ R & '!''*<*<T*B*H*HH+4+T+aggFXFXY]F^FdFd:e9ffg(hI ((22*m3yk9TU% >rM   rU   )rI   rJ   rK   r   r   r   r   r   )r  r}  r   s   rN   r  r    sA    0 )- !:: )	
 & rM   r  )
copy_shapeoverride_mask)IndexingOptionsr   z).broadcast_to(xindex.shape)z = tl.load( + (r   z, mask=z, other=c                     j                         5  j                  j                  t                      j	                          j                  j                  t                      j                         j                  vrJ j                  j                         t        j                  j                  j                         dz        j                         cd d d        S # 1 sw Y   y xY wN )r  r   r  r   r  r_  r   r   r[  textwrapr\  r]  strip)r}   r  rg  	load_coderW   s   rN   rN  z-TritonTemplateKernel.load_input.<locals>.hookB  s    ''1 Y##JL1!!###JL1&&(0J0JJ$000II''	2tyy'9'9';S<=OPVVXY Y Ys   CC--C6);r   r   r  r_  r?   r   r   Integerconstruct_range_treesr  rT  rZ   tupler   ry  r   r   rC   r   r  simplifyr   mapr%   parenSymbolr  r`  construct_entriesset_namer=   r   FlexibleLayoutrw  r   r   r[  r)   lookupr   r   template_indicesrz  freeze_layoutr   r  r   r   r   make_indexerstatically_known_equalslayoutrd  rc  indexingcodegen.tritonr  r   r   )rW   r  r}  r  r   r  r  tilingsgroupsr   kvr   index_symbolsslengthsr   range_tree_entrycontiguous_indexxindex_range_rootr  output_indexoutput_index_strout_indexingr  rN  r}   rg  r  r   s   ` `  ``                   @@@@rN   
load_inputzTritonTemplateKernel.load_input  s   ( ++J7
&&**:+>+>+@A !4!4!67q9IJ1:

 00" 1 
 	&&j\'CD }	!ge}555k3///dS$t*$5666*DGM||~Vtq!1agg..77::VDK3{00':;GDKLqU\\!T:LML=G=P=P=RSqww''003SGSw<3w<///DKLqU\\!T:LMLw<3w</// +.))!,>>wG+ 0&& !))$/0  )!!44W=}   $334DEIIe4D.E EF $ 0 0 3 : :a -"8! &&x0 *.)9vD (D$+D!"":.33AACHH
- ..M   !*:*:  D  7D//
;J4:224]CL
 77##;;!!((!  ,d.B.B))+22/    //=L//#+ #}} #00"&"4"4  -  
 <!,@@@..//KL !
 '-{:,dCSBTTUVIwtfHUG1==	S 	{}	!~ "*Q/		Y 		Y t00000&*(#S WLS M}	! }	!sD   'A"T	4S3=0T-S9T0S>T.TJ/T3TTrk  c                      j                  d      5  t        |t        t        f      sJ t        |t              sJ t        |t        t        d      f      sJ  j                  J t        t        t        j                  |            }|D cg c]  }t        j                  |d       }} j                  j                         D cg c]+  }t        j                  j                   j#                  |      - }}t%        |      t%        |      k(  sJ t'        | j(                  d   j+                  |            D ]  \  }	}
|
j-                  |	        t/        t0        j2                  j5                  |      |      } j7                  |      } j8                  j;                  dt=        |      z           j(                  d   j?                  t        j@                  jB                  tE        |            j-                  d       | _        | _#        | _$          j                  jK                         jM                         |      } j7                  |      }||k(  rt        j                  dd      }d jN                  v rtQ         jN                  d         ntR        jT                  }t        jV                  jX                  j[                  ||	      g}t]        j^                   j`                  d jb                    j`                  t%         j`                         jd                  z
  d       D ]7  }|jg                          |ji                   |jk                         |             9 t        jl                  jo                   j                  jq                         |  jr                  |         ju                          ddd        fd
}d jv                  vsJ | jv                  d<   yc c}w c c}w # 1 sw Y   9xY w)a  Stores the final output and appends any epilogue fusions if the buffer hasn't been optimized away.

        Args:
            indices (Union[List, Tuple]): The index for each dimension of the output. The dot product of
                these indices and output strides must match `val`.
            val (str): The value to store.
            mask (Optional[str]): An optional mask to use for the store operation. If provided, this mask
                will be applied to the store.
            indent_width (int): The number of spaces to use for indentation. This is used when the call to
                store_output is indented in the kernel definition.
        <STORE_OUTPUT>NTr  r   r  r  ACC_TYPEr   c                      j                          j                  j                  t                      t	        j
                  j                  j                         d z        j                         S r  )	r  r   r  r   r  r\  r   r]  r  )r  rW   s   rN   rN  z/TritonTemplateKernel.store_output.<locals>.hook  sO    HH
-??499#5#5#7|9KLRRTTrM   )<r  rT  rZ   r  r   ry  r   r  r%   r  r   r  r   r   rC   r   r  r  r  r`  r   r  r  r=   r   r  rw  r   r   r[  r)   r  r   r   r?   r   r  rc  r  r   rA   r[   r   r   r   namedvarr  r  r   r   r   r  r   r  r   r   r_  r   r  r   )rW   r  rk  r   r  r   r  r  r  r   r  r  r  	acc_dtypeepilogue_argsrg  rN  s   `   `            rN   r{  z!TritonTemplateKernel.store_outputQ  s   $ &&'78 3	 ge}555c3'''dS$t*$5666%%---3{00':;GDKLqU\\!T:LML6:6F6F6O6O6Q12  ))!,G  w<3w</// +.))!,>>wG+ 0&& !))$/0  )!!44W=}   $334DEIIe4D.E EFQ&&uww{{M'4JKTT "&D #D$+D!G4++668EEGVL//=L//$||HdC * %TYYz%:;]] 
 XX\\223i2HIM'oo  !34#3#34  T%5%5!69I9I!I!KL N
 ((*$$%=Z%;%;%=m%LMN EEKK  ))+   -0
 g3	 j	U  t'8'8888.2*+q M3	  3	 s+   A=O=O3/O=0O8=KO=3
O==Pc                 n    t         |j                  di | j                         || j                        S NrL   )rv   rendertemplate_envr   )rW   templaterq   s      rN   r  zTritonTemplateKernel.render  s7    HOO<d//1<V<
 	
rM   c                     t        |t        t        f      sJ t        |t              sJ t        |t              sJ  j                  |   j                         }t        t        t        j                  |            }t        |      t        |      k(  sJ dj                   fdt        ||      D              }d| d| d| dS )zw
        Optional helper called from template code to generate the code
        needed to load from an tensor.
        r   c              3   b   K   | ]&  \  }}t        j                  |             d |  ( yw)z * N)r)   r   )r  r  r'  rW   s      rN   r  z1TritonTemplateKernel.make_load.<locals>.<genexpr>  s6      
:>!QuT))!,-.c!5
s   ,/r   r  z), z, other=0.0))rT  rZ   r  r   r   rm  r  r%   r  r  rL  r`  )rW   r   r  r   rn  r   s   `     rN   	make_loadzTritonTemplateKernel.make_load  s    
 'D%=111$$$$$$$$''-88:s;,,g677|s6{***

 
BEfgBV
 
 $tE7#dV<@@rM   c           	         | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  f	D ci c]  }|j                  | c}S c c}w )zA
        Generate the namespace visible in the template.
        )
ri  r   rn  r{  r  r  r  rO  rQ  rI   rW   r   s     rN   r  z!TritonTemplateKernel.template_env  su     		!!!!    

 KKO
 	
 
s   (A?dense_indexingr  r  	block_ptrc                T    t         |   |d| j                  | j                  |      S )zh
        Override the default indexing to use our custom mask and force
        dense indexing.
        Fr  )rx   r  r   r   )rW   r   r  r  r  r  r|   s         rN   r  zTritonTemplateKernel.indexing  s9     w  ((,,   
 	
rM   c                      y rU   rL   rV   s    rN   codegen_range_treez'TritonTemplateKernel.codegen_range_tree  s    rM   nodec                    t         j                  j                  }| j                  j	                         \  }}}}d}t        | j                  t              r4 | j                  j                  g | j                  | j                   }n9t        d | j                  D              r7 | j                  g t        t        | j                        | j                   }nt         j                  j                  rJ d       |j                  d| j                  j                           |j#                  | j                        }| j                  j                    d| j                  j$                   }	|j'                  d|	 ddj)                  t        t*        | j                               d| d	       |j'                  d        t-        |      d
v sJ d       |j/                  |       |j/                  t        t0        |             | j2                  |j5                  | j2                         |j7                  |||| j8                  d       | j2                  |j;                  | j2                         y y )NrL   c              3   \   K   | ]$  }t        |t        t        j                  f       & y wrU   )rT  r   r   r  rJ  s     rN   r  z3TritonTemplateKernel.call_kernel.<locals>.<genexpr>  s      NAU]]34Ns   *,z#cpp_wrapper requires SymbolicGridFnzimport .*r   r   r   )r      zgrid_fn should return 3 valuesT)	arg_typesr   r`   )rC   r   wrapper_coder   r6  rT  r   SymbolicGridFn
sympy_callr   r   r
  r  r   cpp_wrapperadd_import_oncerJ   add_meta_oncerI   r   rL  r/   r  extendry  r   generate_workspace_allocationgenerate_kernel_callr   generate_workspace_deallocation)
rW   r   r  wrapperr@  	call_argsr  	grid_argsr   fn_names
             rN   call_kernelz TritonTemplateKernel.call_kernel  s   ''&&%)YY%=%=%?"9a	dllN3///LL$))LINdooNN$Kc#t&?KKIww**Q,QQ*##gdll.E.E-F$GH((3D0014<<3H3H2IJGG9AdiiE4??(CDERvQO T"9~'I)II'#T9-.)11$2D2DE$$(( 	% 	
 )33D4F4FG *rM   c                      | j                   g t        j                  j                  j	                  | j
                        | j                   D cg c]  }t        |       c}S c c}w rU   )r   rC   r   r  r  r   r   r   )rW   r   s     rN   kernel_benchmark_extra_argsz0TritonTemplateKernel.kernel_benchmark_extra_args  s`     "T\\ !!,,T__=?Cyy
 F
 	
 
   A'rU   )N           )Nr  ),rI   rJ   rK   r   r   rZ   r   rx  r&   ry   r  contextmanagerr   r  r  r  r*  rD  rO  rQ  ri  r   r   rn  rq  r|  r  r
   r   r  floatr  r{  r  r  r  r   r   r  r  IRNoder  r  r   r   s   @rN   r   r     sD    7;04!L< D!2!234L<   -!L<" 
#L<\ &3 & &@ c  +ZHT0 0S 0H	/S 	/
0 #	33 c]3 sm	3 
3t #-0ss s tCy%*,-	s
 sms eSj)*s sr #P tCy%*,-P  P  sm	P 
 P d
A 
. 
zz
. H  H8BII+>  HD
T#Y 
rM   r   c                  b    	 dd l } | j                  | j                        S # t        $ r Y y w xY w)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  s    rN   _jinja2_envr    s?    !!,, " 
 	
  s   " 	..c            	            e Zd ZU  ej                         Zi Zeed f   e	d<   ddede
deddf fdZddeddddfd	ee   fd
Z xZS )TritonTemplateall_templatesr   gridsourcerc   Nc                     t         |   |       || _        | j                  |      | _        || j
                  vsJ d       | | j
                  |<   || _        y )Nzduplicate template name)rx   ry   r  _template_from_stringr  r  debug)rW   r   r  r  r  r|   s        rN   ry   zTritonTemplate.__init__!  sZ    	226:4---H/HH-#'4 
rM   r   r   c                    $  j                   sJ d       t               }t        j                  j	                         sdd<   j                         D ]  \  }}|j                  | d| d        |j                         }t        j                  d|      }d j                   }t        |j                        }t        j                  ||f      }t        j                   ||      st#        d	      |
|j                  }
|||| j$                  |
||||d$t'        j(                  t*        j,                  d j/                  |            5  t*        j,                  j1                  |j2                        5  t5        d(||dd$5 }	 |j7                   j                         }|j9                  d      5  |j;                         }d
d
d
        j>                  rtA        d       djC                  g tE        jG                               D cg c]  }| dtI        |           c}d| d|       dz   }tK        jL                  |      }d
d
d
       d
d
d
       d
d
d
       tO        jP                  jR                  jG                               }tO        tU        d |D                    }|d
tW        |       |k(  s	J ||f       tO        |D cg c]!  }t*        j,                  jY                  |      # c}      }t*        j,                  jZ                  j]                  t_        t`        jb                  tO        |jP                  jZ                  jG                                     td        jf                        }d j                   dti         jj                         }$ fd} jl                  J   j$                  g t*        j,                  jZ                  j]                  |
td        jf                         }!|j2                  jn                  dk(  rtp        }"ntr        }" |"|jl                  |jt                  |g ||!||jw                  dd      jw                  dd      jw                  dd      ty        jz                  |      ty        jz                  |            }#t}        |||| j                  d      j                  dd       |#t        jw                  d!d"      jw                  d#d"      jw                  d$d"      f      ||t        jw                  dd
            t        jw                  d%d
            d&|	|j                  j                         '
      S # 1 sw Y   xY w# t<        $ r Y d
d
d
       d
d
d
       d
d
d
       y
w xY wc c}w # 1 sw Y   UxY w# 1 sw Y   ZxY w# 1 sw Y   _xY wc c}w ))ap  This function generates a TritonTemplateCaller

        Args:
            input_nodes: List of input nodes
            layout: Output layout
            num_stages: Number of stages for triton launch
            num_warps: Number of warps for triton launch
            prefix_args: Number of input nodes to be passed as arguments
            suffix_args: Number of input nodes to be passed as arguments
            epilogue_fn: Optional epilogue function to be called on the output
            subgraphs: Optional subgraphs to be passed as arguments, these will be inlined
                into the triton template string
            mutated_inputs: Optional list of input nodes that are mutated by the kernel, this is helpful
                if you need to return multiple outputs. You can pass them as inputs and mark them as
                being mutated by the kernel.
        zrequires jinja2False
ALLOW_TF32z : tl.constexpr = 
buf_outr   r  triton_z;64-bit indexing is not yet implemented for triton templatesN)r   r   r   r   r   r   r   r   r   r   r   r#  Fr   r   r   r   r  zGenerated Code:
-=znum_stages=z
num_warps=c              3   <   K   | ]  }|j                           y wrU   )r_  rJ  s     rN   r  z*TritonTemplate.generate.<locals>.<genexpr>  s     *MA1::<*MrK  fallbackr@  c                     t        dt        t        j                        | dd}t	        j
                  |j                  j                        }||fS )NFr  rL   )r   r   r;   KERNEL_NAMEr  partialr  r  )out_noder   r  kernel_optionsrq   rW   r   s      rN   make_kernel_renderz3TritonTemplate.generate.<locals>.make_kernel_render  sb    )  7 78$+	
 !F &&F
 6>!rM   cpur2  r   r3  r4  r   )module_pathmodule_cache_keyr   
extra_argsr   r   r2  r3  r4  input_tensor_metaoutput_tensor_metar   r   BLOCK_MBLOCK_KBLOCK_Nr  )
tile_shaper   r   
allow_tf32acc_type)log_infomutated_inputsr   allowed_prologue_inpsrL   )Dr  r   r[   cudais_tf32_supportedr   writer]  r   Bufferr   r?   r   r  r  r+   can_use_32bit_indexingNotImplementedErrorr  r   objectrC   r   _fake_get_dtypeset_current_devicer/  r   r  r  r   ZeroDivisionErrorr  printrL  sortedkeysreprr!   r   r  r   rb  rB   r  r   r  r  r  r   expandr   unbacked_symint_fallbacknextindex_counter__file__ry  r   r   r   r:  r   from_irnodesTritonTemplateCallerr  r   r   r   copy)%rW   r   r  r   r   r   r   r   r   r2  r   r   rq   r   r   rk  fake_outr   r   buffersr   r  rz   kwargextramodinput_call_argsexpected_input_argsr  full_input_nodesr'  kernel_hash_namer#  r  	bmreq_clsbmreqr"  s%   `          ``                       @rN   r   zTritonTemplate.generate)  s   > }}///}* zz++-#*F<  	>ID#MMTF"4SE<=	>""$99)F;		{+fkk*//+{;66ugF%M  J '$"yy$&&&"
 LL+t/C/CH/MN!	0GG&&v}}5!	0 ! '$+	
 !!	0 !==?--.>? 3#002D3
 zz)40 *0)> %  %gQtF5M':&;<
 &j\2 %YK0	 
  ""4/CC!	0 !	0 !	0F   9 9 > > @A $F*M*M$MN9%8!9:>QQ 	
T
 	
Q
 !!QA!''"4"4Q"7!QRWW%%00eFKK$8$8$=$=$?@A44 1 


 %TYYKqd6H6H1I0JK	"  ||'''tyy 
WW((88 ) 

 
 ==&1I1I WW#++d+!!',BA!FNA6**Wa((556FG)66v>'
 $KK$$S$/!

9b1

9b1

9b1 )&!&**\4"@A

:t <= *'"("B"B"G"G"I-
 	
g3 3$ #!	0 !	0 !	0 !	0/!	0 !	0 !	0 !	0 !	0 !	0X "Rs   *W;V8V+-U?;U2	U??V+V&-+V+V8 W&W2U<7U??	V#	V+	V8W"V#	#V++V50V88W	=WWrs   )rI   rJ   rK   r  countrE  r  r   r   r]   r   ry   r   r   r&   r   r   r   s   @rN   r
  r
    st    #IOO%M13M4--.3S  S $  04
  -
rM   r
  c                   z     e Zd Z	 d
dddddd	 d fdZd Zd Z ej                  d      d        Z	 dd	Z	 xZ
S )ExternKernelChoiceNTF)r   has_out_variantop_overloaduse_fallback_kernelkernel_creatorc                   t         |           |xs |j                  }t        |      sJ t	        t
        |      r
J d|        || _        || _        || _        t        t
        ||       || _
        || _        || _        y )Nzduplicate extern kernel: )rx   ry   rI   callabler  extern_kernelsr   cpp_kernel_namerX  r  rY  rZ  r[  )	rW   r   
cpp_kernelr   rX  rY  rZ  r[  r|   s	           rN   ry   zExternKernelChoice.__init__  s     	&v>40T4MdV2TT0	).f-&#6 ,rM   c                 6    t        t        | j                        S rU   )r   r^  r   rV   s    rN   to_callablezExternKernelChoice.to_callable  s    ~tyy11rM   c                      d| j                    S )Nzextern_kernels.r   rV   s    rN   	call_namezExternKernelChoice.call_name  s     ,,rM   c                    | j                         }| j                  t        |dd      t        |dd      g}	 |j                  t	        j
                  |             t        dj                  |            S # t        $ r Y %w xY w)NrI   rp  rJ   r  )	rb  r   r   r   inspect	getsource	Exceptionr   rL  )rW   r   partss      rN   hash_keyzExternKernelChoice.hash_key	  s{    IIB
B'Bb)

	LL**2./ %))  		s   $A5 5	B Bc                 D    || _         t        | |||| j                        S )NrX  )ordered_kwargs_for_cpp_kernelExternKernelCallerrX  )rW   r   r  rn  rq   s        rN   bindzExternKernelChoice.bind  s+     .K*!+vvt?S?S
 	
rM   rU   r   )rL   )rI   rJ   rK   ry   rb  re  r  	lru_cacherk  rp  r   r   s   @rN   rW  rW    sb     -
 !- 
-.2- Y* *" ')	

rM   rW  c            	            e Zd Z	 	 	 	 ddeeeeeee   f   f      dee	   dee
e      ddf fdZd Zd Zdefd	Zd
 Zd Zd Zdeeeeee   f   f   fdZd Zd Z xZS )rH  Nr1  r   r3  rc   c                 8   t         |   ||||       || _        || _        |i }|| _        | j                  j                  d| j                  j                  | j                  j                  d       || _        |	| _	        |
|
| _        y t               | _        y )NTriton)backendr   r   )rx   ry   r#  rT  r1  updater   r   r2  r   r   r3  )rW   r   r   r  r#  descriptionrT  r1  r2  r   r3  r|   s              rN   ry   zTritonTemplateCaller.__init__%  s     	{FK@"4-2
H(0#"jj33!ZZ11	
 -*%:%F! 	"LVL 	"rM   c                V    | j                   J  | j                   j                  |d|iS )NrS   )rT  	benchmark)rW   ri   r   s      rN   ry  zTritonTemplateCaller.benchmarkG  s.    zz%%%#tzz##T===rM   c                 T    | j                   J | j                   j                          y rU   )rT  
precompilerV   s    rN   r{  zTritonTemplateCaller.precompileK  s"    zz%%%

rM   c                 P    d| j                   j                   d| j                   dS )NzTritonTemplateCaller(r   r   )rT  r%  rw  rV   s    rN   __str__zTritonTemplateCaller.__str__O  s*    &tzz'='=&>bAQAQ@RRSTTrM   c                      d| j                    S )Nztemplate_kernels.rd  rV   s    rN   re  zTritonTemplateCaller.call_nameR  s    "499+..rM   c                     dj                  | j                  j                  dd      d   | j                  j                  g      S )Nr  r@  r   r   )rL  r   rsplitrT  r&  rV   s    rN   rk  zTritonTemplateCaller.hash_keyU  s>    xx		  a(+

++
 	
rM   c           	          t         j                  j                  t        j                  | j                  | j
                  | j                  | j                  | j                              S )N)r  inputsr#  r2  r3  )	r   	TensorBoxr8  TritonTemplateBufferr  r   r#  r2  r3  rV   s    rN   r   z TritonTemplateCaller.output_node]  sR    ||""##{{''#'#:#:#22&*&@&@
 	
rM   c                     | j                   S )RInformation returned here is logged to the autotune log file when that is enabled.)r1  rV   s    rN   	info_dictzTritonTemplateCaller.info_dicth  s    }}rM   c                     | j                   S rU   )r#  rV   s    rN   get_make_kernel_renderz+TritonTemplateCaller.get_make_kernel_renderl  s    &&&rM   c                     d}| j                         }|d   }t        |      }|d   }|d   }|d   }|d   }|d   }	d| d	| d
| d| d| d|	 S )Nr`   r.  r   r   r   r   r   ztype=z	_BLOCK-M=z	_BLOCK-K=z	_BLOCK-N=z_numstages=z
_numwarps=)r  eval)
rW   	type_nameinfotile	tile_valsr*  r,  r-  r   r   s
             rN   autoheuristic_idz%TritonTemplateCaller.autoheuristic_ido  s    	~~L!J	A,A,A,,'
%	yk7)9WIYwiWbcmbnnx  zC  yD  E  	ErM   )NNNN)rI   rJ   rK   r   r   r   r
   r2   rZ   r&   r   ry   ry  r{  r}  re  rk  r   r  r  r  r   r   s   @rN   rH  rH  $  s     04;? 
 e-t4E/FFGGH
 
  - 
  (
38 
 
 
D> U U/
	
4U+<dCT>U+U%V VW 'ErM   rH  c                        e Zd Z	 ddddeddf fdZdefdZ fdZd	 Zd
 Z	d Z
deeeeee   f   f   fdZd Z xZS )ro  NTrm  choicerc   c                r    t         |   |j                  ||d       || _        |xs i | _        || _        y )Nrp  )rw  )rx   ry   r   r  rq   rX  )rW   r  r   r  rq   rX  r|   s         rN   ry   zExternKernelCaller.__init__~  s9     	k6rJl.rM   c                 >    d| j                   j                          dS )NzExternKernelCaller(r   r  re  rV   s    rN   r}  zExternKernelCaller.__str__  s    $T[[%:%:%<$=Q??rM   c                   |j                         dk(  ry| j                  rt        |   |d|iS | j	                         } || }t
        j                  j                  j                  j                  |t        |j                               t        |j                                      |j                  |       t        j                  ||i       S )Nr   r  ri   )r   rX  rx   ry  rb  r[   _C_dynamoguardsassert_size_strider  r   rn  copy_r4   )rW   ri   r   algoout_newr|   s        rN   ry  zExternKernelCaller.benchmark  s    99;!7$d444##%DDkGHH##66sxxz*E#**,,? IIg((tR88rM   c                     | j                   j                         }| j                  r t        j                  |fi | j                  S |S rU   )r  rb  rq   r  r   r  s     rN   rb  zExternKernelCaller.to_callable  s:    [[$$&;;$$R74;;77	rM   c                 $   dj                  | j                  j                  gt        | j                  j                               D cg c]  }| dt        | j                  |          ! c}| j                  j                               S c c}w )Nr  r  )rL  r  r   r?  rq   r@  rA  rk  )rW   rL  s     rN   rk  zExternKernelCaller.hash_key  s    xx   "((8(8(:!; gQtDKK$6789 $$&	
 		
s   $B
c           	      &   | j                   j                  ri| j                   j                  J d       t        j                  j
                  | j                   j                  g| j                  i | j                  }n| j                   j                  1 | j                   j                  | j                  i | j                  }n| j                  rt        j                  nt        j                  } || j                  | j                  | j                   j                         | j                   j                  | j                   j                  | j                   j                  | j                        }t        j                   j                  |      S )Nz6Please provide an op_overload to use ir.FallbackKernel)r  r  python_kernel_namer_  rn  rY  rq   )r  rZ  rY  r   FallbackKernelr8  r   rq   r[  rX  ExternKernelOutExternKernelAllocr  re  r_  rn  r  )rW   innerrl   s      rN   r   zExternKernelCaller.output_node  s/   ;;**;;**6 H6 %%,,''*.*:*:>BkkE [[''3.DKK..0@0@PDKKPE(,(<(<"$$"BVBVC{{''#';;#8#8#: $ ; ;.2kk.W.W KK33{{E ||""5))rM   c                 <    d| j                   j                         dS )r  ra   )ru  kernel_call_namer  rV   s    rN   r  zExternKernelCaller.info_dict  s!       $ 5 5 7
 	
rM   c                 4    d| j                   j                   S )Nextern_)r  r   rV   s    rN   r  z#ExternKernelCaller.autoheuristic_id  s    ))*++rM   rU   )rI   rJ   rK   rW  ry   r   r}  ry  rb  rk  r   r   r
   r2   rZ   r  r  r   r   s   @rN   ro  ro  }  sw     / /"/ 
/@ @9

*0
4U+<dCT>U+U%V VW 
,rM   ro  rc   c                  ^    t         j                  j                  dd       } | sy d| vr|  d} | S )NTORCHINDUCTOR_MM_LOGGING_FILEjson.json)osenvironr:  )mm_file_names    rN   get_mm_log_filenamer    s7    ::>>"A4HL\!&u-rM   c                    | j                  dd      }t        |      }|5  	 t        |       5 }t        j                  |      }d d d        j                  |       t        | d      5 }t        j                  ||d       d d d        d d d        y # 1 sw Y   PxY w# t
        t        j                  f$ r g }Y pw xY w# 1 sw Y   ?xY w# 1 sw Y   y xY w)Nr  z.lockwr  )r\  )	r   r   openr  r   FileNotFoundErrorJSONDecodeErrorr   dump)filenamerz  	lock_filelockflog_datas         rN   append_to_logr    s      '2IID	 
-	h (199Q<(
 	(C  	-AIIh!,	-
- 
-( (!4#7#78 	H	
	- 	-
- 
-s]   CBBBC(B?CB	BB<9C;B<<C?C	CCc                   R    e Zd ZddZd ZdefdZdej                  fdZ	de
fdZy)	 DataProcessorChoiceCallerWrapperrc   Nc                 Z    || _         ||| _        nd | _        ||| _        y d | _        y )Nc                 
    | |fS rU   rL   r   ys     rN   r	  z;DataProcessorChoiceCallerWrapper.__init__.<locals>.<lambda>  
    q!f rM   c                     | S rU   rL   r   s    rN   r	  z;DataProcessorChoiceCallerWrapper.__init__.<locals>.<lambda>      A rM   )_wrapped_preprocessor_postprocessor)rW   wrappedpreprocessorpostprocessors       rN   ry   z)DataProcessorChoiceCallerWrapper.__init__  s5    #!-D!4D$"/D"-DrM   c                 .    t        | j                  |      S rU   r   r  r   s     rN   __getattr__z,DataProcessorChoiceCallerWrapper.__getattr__      t}}d++rM   c                    | j                  ||      \  }} | j                  j                  |d|i}| j                  |      }||ur|j	                  |       |S )Nri   )r  r  ry  r  r  )rW   ri   r   new_argsnew_outresults         rN   ry  z*DataProcessorChoiceCallerWrapper.benchmark  s]     ..tS9'((((@@%%g.gIIgrM   c                 X    | j                   j                         }| j                  |      S rU   )r  r   r  )rW   r  s     rN   r   z,DataProcessorChoiceCallerWrapper.output_node  s%    **,""6**rM   c                 "    d| j                    dS )Nz!DataProcessorChoiceCallerWrapper(r   r  rV   s    rN   __repr__z)DataProcessorChoiceCallerWrapper.__repr__
  s    24==/CCrM   r   )rI   rJ   rK   ry   r  r   ry  r   r  r   r   r  rL   rM   rN   r  r    s6    	.,u +R\\ +D# DrM   r  c                   :    e Zd ZdZ	 	 d	dZd Zd Zd ZdefdZ	y)
DataProcessorTemplateWrappera  
    A wrapper class for a kernel template.

    This class together with `DataProcessorChoiceCallerWrapper` provides a convenient way to
    preprocess and postprocess data before and after using the wrapped template. A typical
    usage is to reorder or filter the input nodes in order to match the expected input of other
    kernel choices like a ATen kernel. A more complicated usage is to prepack the weights.
    See the example from :mod:`cpp_gemm_template` for more details.
    rc   Nc                     ||| _         nd | _         ||| _        nd | _        d|v sJ d|v sJ  ||d   |d         \  |d<   |d<    |di || _        y )Nc                 
    | |fS rU   rL   r  s     rN   r	  z7DataProcessorTemplateWrapper.__init__.<locals>.<lambda>#  r  rM   c                     | S rU   rL   r  s    rN   r	  z7DataProcessorTemplateWrapper.__init__.<locals>.<lambda>'  r  rM   r   r  rL   )r  r  r  )rW   wrapped_template_clsr  r  rq   s        rN   ry   z%DataProcessorTemplateWrapper.__init__  s     #!-D!4D$"/D"-D&&&6!!!2>=!6(#33
/}vh/ -6v6rM   c                 .    t        | j                  |      S rU   r  r   s     rN   r  z(DataProcessorTemplateWrapper.__getattr__/  r  rM   c                 P     t        | j                        j                  | |fi |S rU   )ry  r  maybe_append_choice)rW   choicesrq   s      rN   r  z0DataProcessorTemplateWrapper.maybe_append_choice2  s%    6tDMM"66tWOOOrM   c                 |     | j                   j                  di |}t        || j                  | j                        S r  )r  r   r  r  r  )rW   rq   choice_callers      rN   r   z%DataProcessorTemplateWrapper.generate5  s;    ...88/4--t/B/B
 	
rM   c                 "    d| j                    dS )NzDataProcessorTemplateWrapper(r   r  rV   s    rN   r  z%DataProcessorTemplateWrapper.__repr__;  s    .t}}oQ??rM   r   )
rI   rJ   rK   rY   ry   r  r  r   r   r  rL   rM   rN   r  r    s4    7 
7,,P
@# @rM   r  c                   (     e Zd Zdeddf fdZ xZS )ErrorFromChoicer  rc   Nc                 H    |d| d| z  }t         |   |       || _        y )Nz
From choice r  )rx   ry   r  )rW   msgr  
inputs_strr|   s       rN   ry   zErrorFromChoice.__init__@  s.    xr*66rM   )rI   rJ   rK   r1   ry   r   r   s   @rN   r  r  ?  s    L   rM   r  c                       e Zd Zy)NoValidChoicesErrorNrH   rL   rM   rN   r  r  F  rO   rM   r  c                      dt         j                  v rt        t         j                  d         S t        t         d      rt	        t        j
                  d            nt        j                         } | sJ | S )NTORCHINDUCTOR_COMPILE_THREADSsched_getaffinityr   )r  r  r   r  r  r  	cpu_count)r  s    rN   get_num_workersr  J  sc    &"**42::=>?? 2*+ 	B  #$\\^ 
 9rM   c                 d    t        | D cg c]  }t        j                  |       c}      S c c}w rU   )rA  AlgorithmSelectorCachekey_of)r   r   s     rN   create_inputs_keyr  X  s'    ;Ga'..q1GHHGs   -r   
inputs_keyr  c                     dj                  | |t        j                         g|D cg c]  }|j                          c}z         S c c}w )N:)rL  r[   get_float32_matmul_precisionrk  )r   r  r  r  s       rN   create_precompile_keyr  \  sM     88..0	

 ,3
36??
3	4  4s   A
c                   T    e Zd Zd fdZddZ	 	 	 ddee   deee	e
ej                  gej                  f   f      de	fdZe	 dd       Zed	ed
eej(                     deeef   dedef
d       Zed        Ze	 dd       Zed        Zde
eeef   eee   ee   gdf   fdZ xZS )r  Nc                 V    t        |   |i | i | _        g | _        t	        |        y rU   )rx   ry   precompile_cachefeedback_saver_fnsr   )rW   r   rq   r|   s      rN   ry   zAlgorithmSelectorCache.__init__j  s8    $)&) @B 	 	 	&d+rM   c                 8    | j                   j                          y rU   )r  clearrV   s    rN   cache_clearz"AlgorithmSelectorCache.cache_clear{  s    ##%rM   r  input_gen_fnsprecompilation_timeout_secondsc           	      0    ddl m} j                  j                  dk(  rd}D 	cg c]  }	|	|		 c}	t        j
                  j                  HD 
cg c]<  }
t        j                  t        j
                  j                  |
j                        r|
> c}
t        j
                  j                  HD 
cg c]<  }
t        j                  t        j
                  j                  |
j                        r|
> c}
t               x}rId   j                         d d \  }}d   j                         d   }t        |dt        |||f      i       t!              d	k(  rd
k7  rdnd}t#        d| d      t$        j'                  dt        t!                           t!              dk(  r"t)        d	   |      sd	   j+                         S t-        j.                  d        fd       t1              dt2        g d f   f fd}fdt        j4                  rddlm} |j;                           fd |      |rt        j<                  st        j>                  rfd}tA               }D ]"  }
t)        |
tB              s||
jD                  z  }$ tF        jH                  jJ                  jL                  jO                  tF        jH                  jJ                  jQ                  ||            S        }|i k(  sd	   |vrd	   j+                         S tS        jT                  ||jV                        }|j+                         }t$        j'                  dt        |             |S c c}	w c c}
w c c}
w )Nr   )CUDATemplateCallerr$  Fr   r+  invoker   convolutionmax_autotune_gemm_backendsmax_autotune_conv_backendsz7No choices to select, please consider adding ATEN into zM config (defined in torch/_inductor/config.py) to allow at least one choice. z%Max autotune selects from %s choices.c                  ,    j                         S rU   )make_benchmark_fn)r  r  r   r  rW   s   rN   r  z:AlgorithmSelectorCache.__call__.<locals>.make_benchmark_fn  s    ))';VVrM   rc   c                    t         j                  d       d }dk  rt         j                  d       |S t        t               t	        |             }|dk  r|S t
        j                  j                  dk(  r<t
        j                  j                  dk(  rt
        j                  j                  dk  r|S j                  | d       }|rXt	        |      t	        |       k(  rt         j                  d	       |S t         j                  d
t	        |      t	        |              t        j                  r"t        j                  st        j                  s|S t!        |       }j"                  j%                  |      x}rt         j                  d       |S t         j                  dt	        |       |       d }fd}t'        |      t(        j*                  j,                  j/                         }i i i t1               }	| D ]k  }
|
j3                         |	v rt         j                  d|
       -|	j5                  |
j3                                t7        |
d      sYt9        |
t:              xr t9        |
j<                  t>              }|r|jA                         rtC        |
j<                  jD                        5 }|jG                         }d d d        |jI                  |
j<                  jJ                        jL                  }t         j                  d|
       n(jO                  ||
      }t         j                  d|
       tQ        jP                         |<   |jS                  |       |
|<   n tU        jV                  d       tY               fd              }|j"                  |<   |S # 1 sw Y   xY w)NzStarting precompilationc                       y rU   rL   r   rq   s     rN   no_opzBAlgorithmSelectorCache.__call__.<locals>.precompile.<locals>.no_op  s    rM   r   z7Precompilation timeout is None or <= 0, returning no_opr        )ry  z'Timings found in cache, returning no_opz<Found only %d/%d timings for %s, not skipping precompilationz0Precompile function found in cache, returning itzCMultithreaded precompilation for %d choices using %d worker threadsc                     t         j                  d|        t               5  | j                          d d d        y # 1 sw Y   y xY w)Nz,Precompiling choice with captured stdout: %s)logr  r<   r{  )r  s    rN   precompile_with_captured_stdoutz\AlgorithmSelectorCache.__call__.<locals>.precompile.<locals>.precompile_with_captured_stdout  s8    		H&Q*, (%%'( ( (s	   ;Ac                     | v sJ t        j                          |    z
  | <   t        j                  d| |           y )Nz<Precompilation complete for future: %s, elapsed time: %.02fs)timer  r  )futureelapsed_timesstart_timess    rN   on_completezHAlgorithmSelectorCache.__call__.<locals>.precompile.<locals>.on_complete  sE    ,,,(,		k&6I(If%		R!&)rM   )max_workersz Skipping already seen choice: %sr{  )r   source_codez-Submitted triton async compile for choice: %sz#Submitted precompile for choice: %sc                  \   t         j                  d       t        d   dxx   dz  cc<   t              D ]`  } | j	                         x}rt         j                  d||           0t        d   dxx   dz  cc<   t         j                  d|    |           b j                  d	
       y )NzWaiting on futuresinductorselect_algorithm_precompiler   )timeoutz$Exception %s for benchmark choice %s select_algorithm_num_precompilesz,Precompiling benchmark choice %s took %.02fsT)wait)r  r  r   r   	exceptionerrorr  shutdown)r  er  executorfuturesr  s     rN   wait_on_futureszLAlgorithmSelectorCache.__call__.<locals>.precompile.<locals>.wait_on_futuresC  s     		./$%BCqHC*: F #,,..q.		BAwv !,-OPTUUPJ#FO)&1  !!t!,rM   )-r  r  minr  r  sysversion_infomajorminormicror  r  r   search_autotune_cachemax_autotunemax_autotune_gemmr  r  r:  r   r[   	_inductorasync_compileAsyncCompiler   rk  r  r  rT  rH  rT  r   use_process_poolr  r%  readr`   r   r  submitr  add_done_callbackr  rq  r<   )r  r  num_workerstimingsprecompile_keyprecompile_funcr  r  r6  seen_choicesctriton_cuda_choicefiler  r  r+  r  r)  r*  r  r  r   r  rW   s                   @@@@rN   r{  z3AlgorithmSelectorCache.__call__.<locals>.precompile  sG   II/0 /61Q6		STo/W>Ka   &&!+$$**b0$$**a/ kk	 " G  w<3w</IIGH L HHVGG	 ++##v'?'?24WMN"&"7"7";";N"KKK		LM&&HHUG(
 *kBH!OO99FFHMJLGGIKIKM 6@\L (::<</II@!D $$QZZ\21l+)3/* *I$QWW.GH ' *m.L.L.N!!''"5"56 6$*.))+K6!.!5!5()(;(; "6 " &  		"QSTU!)1PRS!T		"GK*.))+K',,[9&'GFO3(6   &"$- % '-* 5DD!!.1""O6 6s    N77O 	c                     t         j                  d       t         ddd      5           |       cd d d        S # 1 sw Y   y xY w)NzStarting autotuning_template_autotuningTcompile_time_autotune_time_uslog_pt2_compile_eventdynamo_compile_column_us)r  r  r   )r  r  r   s    rN   autotunez1AlgorithmSelectorCache.__call__.<locals>.autotune^  sN    II+,&,-&*)H 4
 +(*734 4 4s	   ?A)tuning_poolc                 P   t        j                          }t         ddd      5   |         d d d        t        j                          |z
  }t        j                  d|       t        j                          }j	                  
      }t        j                          |z
  }t        j                  d|       |r&t        d |j                         D              rt        j                         j                  rt        d   d	xx   d
z  cc<   j                         j                  s?t        j                         t        j                  k(  st        j                  j                   rj#                  	|||       j$                  D ]  } ||	        |S # 1 sw Y   nxY w)N_template_precompilingTrF  rG  z#Precompilation elapsed time: %.02fszAutotuning elapsed time: %.02fsc              3   H   K   | ]  }t        j                  |         y wrU   )mathisfinite)r  timings     rN   r  zIAlgorithmSelectorCache.__call__.<locals>.do_autotuning.<locals>.<genexpr>  s!      .4DMM&))s    "r   select_algorithm_autotuner   )r  r   r  r  r  r
  r  r  
cache_infocurrsizer   getEffectiveLevelloggingDEBUGr   tracelog_autotuning_resultslog_resultsr  )precompile_fnprecompile_start_tsprecompile_elapseautotune_start_tsr=  autotune_elapsefeedback_fnrJ  r  r   r  r  r   rW   s          rN   do_autotuningz6AlgorithmSelectorCache.__call__.<locals>.do_autotuningm  s}   "&))+&./&*)H  
   !%		.A AII;=NO $		kk	G #iik,==OII7I3 8?8H  *) ++-66$%@AQFA ",,.77((*gmm;<<66  +wAR  $66 AGT;@A NO   s   FF%c                            } t        d      }| j                         D ]"  \  }}t        |t              st	        ||      }$ | j                         D ci c]  \  }}||k  st        |t              s|| } }}| S c c}}w )Ninf)r   r   rT  ro  r,  )r=  min_extern_choicer  rQ  r  ra  r[  s        rN   get_timingsz4AlgorithmSelectorCache.__call__.<locals>.get_timings  s    '6$)%L!&-mmo KNFF!&*<=,/0A6,J)K )0$ 11)&2DE	 DL  s   "Br   zselected choice: %s),codegen.cuda.cuda_kernelr  r/  ry  r   test_configsautotune_choice_name_regexresearchr   autotune_choice_desc_regexrw  r  r   r  r   r  r  r  r  rT  r   r  rq  r  r   autotune_in_subprocautotune_processrK  
initializer3  r4  r   rH  r3  r[   r5  r   r  r8  MultiTemplateBufferbuiltinsr,  __getitem__)rW   r   r  r   r  r  r  return_multi_templater  r  rA  r  MKNbackend_configr{  rK  re  r3  r=  selected_keyselected_choicerJ  ra  r  r  r[  s   ```````                @@@@@rN   __call__zAlgorithmSelectorCache.__call__~  sg    	A
 $(:(:e(C$)!
 )0Ff63E6F99E !99''BBFF G 99E !99''BBMM G /00<0r?++-bq1DAqB((*2.A,31ay>(BCw<1 =( -1 
 &I.IY Z_ _  			93s7|;LMw<1gaj*<=qz--//			T	"	W 
#	W '{3
S	#8BH#5 S	# S	#j	4 %%5 ""$)	 )	V #7+ f&9&9V=U=U* 6@\! Ea!56)Q-D-DD)E ??%%//66""66)   .b=GAJg51:))++||G1D1DE&224		'_)=>a	 Gs   N	N	!ANANc                    	 i dt         t        t           t        t           f   dt        f fd	t
        rt        t        |       d       dt        dt        dt        fddt         t        t           t        t           f   dt        t         t        t        f   t        f   f	fddt         t        t           t        t           f   ffd	}t        j                  r|}|S }|S )
Nr  rc   c                    t              D ci c]6  \  }}|j                          j                  |
j                        |      8 }}}t	        |j                               }D cg c]6  }||j                            j                  r||j                            nt        j                  ||j                            t        j                  j                  j                  |j                         t        j                        t        j                  j                  j                  |j!                         t        j                        t        j                  j                  j#                  |j%                         j&                  t        j                              9 }}
j                        }t        j                  ||j)                         |j+                         t        j                  j                  j#                  j&                              }d }	t,        r% | d   j.                  |d|i |j1                         }	t2        j5                  |||||	      S c c}}w c c}w )Nr  r   ri   )r  r_  r:  benchmark_example_valuerZ   r  	is_mkldnnr[   
as_stridedrC   r   r  r  r   r   rC  rm  	size_hintrc  rd  r   rn  rD   ry  cloner_   rm   )r  r'  r   unique_example_inputsrg   rg  rh   ri   rj   rb   rl   r  r   r  s             rN   
get_inputsz<AlgorithmSelectorCache.make_benchmark_fn.<locals>.get_inputs  s%    &k2%Aq 

Om//33N3NOPQRR%! % ""7">">"@AN* #.)%( # -Z-@-@-BCMM ***=*=*?@))-j.A.A.CD((33&//1%+%D%D 4  ((33&113%+%D%D 4  ((22&113::%+%D%D 3 %! %, --f5C))SXXZqww/?/?/I/I&--/XJ H$
$$&;LL%++-00% I%
%s   ;I%+D<I+z tuning requests:r  autotune_argsc                 ~   t        | t              }|j                  |      }|j                         \  }}|j	                           | j
                  |d|i}t        d |D        d      }t        |      }|j                         r|j                          t        r"|j                   |j                  di t         |S )Nri   c              3      K   | ]:  }t        |j                  j                        s#|j                  j                   < y wrU   )r:   r/  ry  )r  tensors     rN   r  zhAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_choice_in_current_process.<locals>.<genexpr>  s+     V6&--BTBT;U##Vs
   $AAr4  rL   )rT  ro  rf   rX   zero_ry  rD  r   is_availablesynchronizerD   rb   rr   )	r  r  	is_externbenchmark_tensorsinptsoutputr  device_typedevice_interfaces	            rN   #benchmark_choice_in_current_processzUAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_choice_in_current_process  s     #6+=>I - C CI N-446ME6LLN%V%%u9&9FV%VK  8D,,. ,,.-00<$$$.v.MrM   c                     	|       }i }| D ]  }	  ||      }|||<    |S # t         $ r4}t        j                  dt        |             t	        d      }Y d }~Ad }~wt
        $ r+}t        j                  d|       t	        d      }Y d }~sd }~wt        $ rI}t        |      }d|v r|dz  }n	d|v r|dz  }t        j                  d|       t	        d      }Y d }~d }~wt        $ r}t        d	| d
|       d }~wt        $ rR}	 ddl
m} t        ||      r!t        j                  |       t	        d      }n|n# t        $ r |d w xY wY d }~9d }~ww xY w)NzECUDA compilation error during autotuning: 
%s. 
Ignoring this choice.rc  zNot yet implemented: %szinvalid argumentz>

This may mean this GPU is too small for max_autotune mode.

zillegal memory accessz*

Either error in template or triton bug.
z<Runtime error during autotuning: 
%s. 
Ignoring this choice.zIncorrect result from choice z

r   )OutOfResources)r0   r  r&  r   r   r9  warningr   AssertionErrorri  triton.runtime.autotunerr  rT  r  )
r  r  r=  r  rQ  r(  r  r  r  r  s
           rN   benchmark_in_current_processzNAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_in_current_process  sr     (FG! ()%*@PFL #)Q()T NO ( *IIaA #5\F* *KK 91="5\F# *a&C)S0cc2c9#RRCIIX #5\F% (7xtA3G  ! 
*	*K%a8KKN%*5\F"#G # ' *T)* #
*sV   	$	E*AE(!BE?CE*C;;E5D>=E>EEEc                     ddl m} | D cg c]  }t        |t              s| }}| D cg c]  }t        |t              r| }} |      }|j	                  |j                  |             |S c c}w c c}w )Nr   )rn  )rp  rn  rT  ro  rv  benchmark_in_sub_process)r  rn  rA  ra   r`   r=  r  s         rN   r  zJAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_in_sub_processM  st     + ")NAJq:L,MaNFN!(RA
1>P0QaRFR26:GNN+DDVLMN ORs   A2A2A7A7)r
   rZ   ro  rH  r_   rW  r>  r  r1   r   r   r   rm  )
rl   r  r   r  r  r  ry  r  r  r  s
   ` ```  @@@rN   r  z(AlgorithmSelectorCache.make_benchmark_fn  s     M.	4 23T:N5OOP.	.	 .	` S\N"345	 	1=		(/	4 23T:N5OOP/	%*,@@A5HI/	b	4 23T:N5OOP	  )) % 	  . 	 rM   r   r   r=  elapser]  c                 2   t         j                  j                  | |||       t        j                  st        j
                  rt        sy dj                  |D cg c]f  }dj                  t        t        t         j                  j                  j                  |j                         t        j                                    h c}      }t        j                  dk(  ry t        j                  }t!        j"                        d | }|d   }fd}	t%               }
|
rqd| v rm|d   j                         d d	 \  }}|d
   j                         d
   }t        |||f      j'                         D cg c]
  } |	|       c}i}t)        |
|       |   }t*        j,                  j/                  d|  d| d       |D ]  }|   }|rH|j0                  }t*        j,                  j/                  d|j2                   d|dd||z  dd| d	       Rt*        j,                  j/                  d|j2                   d|dd        t        j4                  rdnd}t*        j,                  j/                  | d|dd|ddt7               d       y c c}w c c}w )Nr   r   r  r   rf  c                 T   t        | t        j                  j                  j                        rd|    dS t        | t        j                  j                  j
                        sJ | j                         }|d   }t        |      }|d   }|d   }|d   }d|    ||||d   |d	   d
S )Ncublas)ry  r  r.  r   r   r   r`   r   r   )ry  r  r*  r,  r-  r   r   )rT  r[   r5  select_algorithmro  rH  r  r  )r  r  r  r  r*  r,  r-  r=  s          rN   get_choice_infoz;AlgorithmSelectorCache.log_results.<locals>.get_choice_info  s    &%//"B"B"U"UV ('&/BB88MM   ##%D%DT
IlGlGlG !""""<0!+. rM   mmr  r   r+  z	AUTOTUNE r   z)
z  r  z.4fz ms z.1%r  z ms <DIVIDED BY ZERO ERROR>

SubProcessSingleProcessz AUTOTUNE benchmarking takes z seconds and z seconds precompiling for z	 choices
)rC   r  rY  r   r3  r4  PRINT_AUTOTUNErL  r  r   r   r  r  r   rC  autotune_num_choices_displayedr?  rr  r  r@  r  r-  stderrr6  rw  r   rm  r  )r   r   r=  r  r]  nsizestop_kbestr  mm_filenamert  ru  rv  r  out_dict	best_timer  kernel_descriptionautotune_type_strs     `                 rN   rZ  z"AlgorithmSelectorCache.log_resultsc  s    	
&&+w0A	
 ##v'?'?		 %  ((33JJL%+%D%D 4 
 00A511wG$7$78!<Qx	4 *+44<r?++-bq1DAqB((*2.A Q1Iw||~ VV!8 VH +x0DM	

9TF!E7#67 
	FV_F%+%7%7"

  Qvcl$y67I#6NaPbOccef 

  Qvcl2OP
	 #66LO 	 	

 !!>vcl-XijmWn(WjB	
Wj !Ws   A+J3Jc                    t        | t        j                        rt        j                  d|       } t        | t        j                        r| j                         } t        j                  t        j                  j                  j                  | j                         t        j                        t        j                  j                  j                  | j                         t        j                        | j!                         | j#                         | j$                  j&                  t        j                  j                  j                  t        j                  j)                  |       t        j                              S )zh
        Convert an ir.Buffer into a concrete torch.Tensor we can use for
        benchmarking.
        faker  r  )rT  r   Layoutr7  BaseViewunwrap_viewr  generate_example_valuerC   r   r  r  r   r   rC  rm  r9  r#  r  rd  get_allocation_size)r  s    rN   r}  z.AlgorithmSelectorCache.benchmark_example_value  s    dBII&99&6DdBKK(##%D &<<GG''88 (  GG''!88 (  OONNKKGG''++D188 ( 
 	
rM   c                     t               5  ||| k(  rt        | ||||      cd d d        S t        |||||      j                  | |      cd d d        S # 1 sw Y   y xY w)N)r/  r   
extra_size)r   r   r  )r   rn  r/  r   r  allocation_sizes         rN   r  z-AlgorithmSelectorCache.generate_example_value  st      ! 	+&/T*A#!)	+ 	+ $#!) *T6*!	+ 	+ 	+s   AAAc                    t         j                  j                  }| j                         j                  t        | j                               g|j                  | j                         t        j                        |j                  | j                         t        j                        |j                  | j                         j                  t        j                        S )zt
        Extract the pieces of an ir.Buffer that we should invalidate cached
        autotuning results on.
        r  )rC   r   r  r9  ry  r   r#  r  r   r   rC  rm  r  rc  rd  )r  r  s     rN   r  zAlgorithmSelectorCache.key_of  s     77##OO"" !
   88 ! 
   !88 ! 
 !((88  
 	
rM   r   c                 :    | j                   j                  |       y rU   )r  r   r  s     rN   add_feedback_saverz)AlgorithmSelectorCache.add_feedback_saver	  s     	&&r*rM   r   )Ni  FrU   )rI   rJ   rK   ry   r  rZ   r1   r   r   r   r   r   r7  r[   r\   rz  rt   r  staticmethodr   r  r   rZ  r}  r  r  r   r  r   r   s   @rN   r  r  i  sX   ,"& SW.5#J l#J  S(BII;3L*M%M NOJ ),JX
  V Vp [
[
"))_[
 lE)*[
 	[

 ![
 [
z 
 
B AE+ +. 
 
.+,%&T#Y\8JKTQ
+rM   r  _ALGORITHM_SELECTOR_CACHEc                      t         
t               a d|vr't        j                  j                  j
                  |d<   t        | i |S )Nrs  )r  r  r[   r5  r   benchmark_epilogue_fusionr  s     rN   autotune_select_algorithmr  	  sI     ($:$<!f,OO""<< 	&' %d5f55rM   r   c                 N    t         
t               a t         j                  |        y rU   )r  r  r  )r   s    rN   r  r  ,	  s!     !($:$<!004rM   c                      t        |       dk(  r?t        j                  j                  t        j                  j	                  | d               S | D cg c]  }t        |       c}S c c}w )Nr   r   )r  r   ExternKernelrequire_stride1realize_inputrealize_inputs)r   r   s     rN   r  r  5	  sO    
4yA~..r/L/LTRSW/UVV'+,!N1,,,r  c                   P    e Zd ZdZdedeeeef   f   fdZdeeeef   fdZ	d Z
y)	r  z
    Wrapper around a grid function that allows either int or sympy inputs.

        @SymbolicGridFn
        def grid(x, meta, *, cdiv):
            return cdiv(x, meta["BLOCK_X"])
    r   .c                 8   || _         i | _        i | _        t        j                  |      j
                  }dt        t        fdt        j                  t        fdt        j                  t        ffD ])  \  }}}||v s|| j                  |<   || j                  |<   + y )Ncdivr,  max)r   
kwargs_int
kwargs_symrg  r.  
parametersr   r7   r   Minr,  Maxr  )rW   r   paramsr   fn_symfn_ints         rN   ry   zSymbolicGridFn.__init__D	  s    ""2&11Wg&EIIs#EIIs#%
 	/ D&&
 v~(.%(.%	/rM   rc   c                 >     | j                   |i || j                  S rU   )r   r  rW   r   rq   s      rN   rz  zSymbolicGridFn.__call__R	  !    tww::$//::rM   c                 >     | j                   |i || j                  S rU   )r   r  r  s      rN   r  zSymbolicGridFn.sympy_callU	  r  rM   N)rI   rJ   rK   rY   r   r  r   ry   r   rz  r  rL   rM   rN   r  r  ;	  sD    /8CsC})=$=> /;5c3+? ;;rM   r  )lowering)rq  r  r   r  rg  r  r  rV  rO  r!  r  rj  r-  r  r  concurrent.futuresr   r   ior   typingr   r   r   r	   r
   typing_extensionsr   unittest.mockr   r   r[   torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._dynamo.utilsr   r   r   r   torch._inductor.utilsr   torch.utils._filelockr   torch.utils._ordered_setr   utils._sympy.functionsr   rp  r   r   rn  r   r   r   r   	codecacher   r    r!   codegen.commonr"   r#   r$   r%   r&   codegen.simd_kernel_featuresr'   r  r(   r)   r*   r+   codegen.triton_utilsr,   r-   r.   codegen.wrapperr/   excr0   r1   r2   r   r3   runtime.benchmarkingr4   runtime.hintsr5   runtime.triton_heuristicsr6   utilsr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   virtualizedrC   	getLoggerrI   r  rD   r   r   r]   r  rW  
concurrenttorch._inductor.codegen.simdrE   rG   r^  	dataclassrQ   r_   rv   r   r   r   r   rq  r  r
  rW  TritonTemplateCallerBaserH  ro  r  r  r  r  r   r  r  r   r  r  rZ   r  r  r  r  r   r  r  r  r  rL   rM   rN   <module>r     sC             	 	 
   ?  @ @ "    $ C . T T ? * / ,   ? >  =  T S " ! / " - + 0     g! S#X @	 	
 !" 6 6 6 %W %W %WP H 
 
 
.BE!** BEJr
< r
j T K
^ K
\6
 6
rVE266 VErS, S,l TXc]  - D D>.@ .@bl 	, 	 T
 
 
Ic I



)-l);

p
+_ p
+f ?C 8$:; B
65$|U*+S$s)T,=OPRVVW5-; ;> rM   