
    VhE                   2   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$m%Z% d
dl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl*m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl+m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddlPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZYmZZZm[Z[ er
d dl\m]Z]m^Z^m_Z_  ej                  ea      Zbej                  j                  ead      Zeej                  j                  ead      Zfej                  j                  ead      Zg eT       j                  Zi eg d       Zjej                   G d! d"             Zl G d# d$el      Zm G d% d&el      Znd4d'Zo ed(eQeQ)      Zp G d* d+eSep   eep         Zq G d, d-e<      Zr ej                  d./       G d0 d1             Zs G d2 d3et      Zuy)5    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNodeTritonTemplateBuffer)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence
perf_hintsschedulefusion)zyxr0_r1_c                       e Zd ZdZej
                  j                  ej
                  j                  d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZee	e
dd                     Zd	dZee	e
d
d                     Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y N)super__init__namevar_list
var_rangesnumelprefixrO   rP   kernelroot)selfrU   rV   rW   rX   rY   rZ   rO   rP   r[   	__class__s             L/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyrT   zIterationRanges.__init__`   sO     		 $
	    c                ,    t        | j                        S rR   )r.   rY   r\   s    r^   is_reductionzIterationRanges.is_reductionx   s     #4;;//r_   c                ,    t        | j                        S rR   )r0   rU   ra   s    r^   symbolzIterationRanges.symbol~   s    !$)),,r_   c                z    t        j                         D ci c]  \  }}||
 }}}|| j                     S c c}}w rR   )r   itemsrY   )r\   symtrY   prefix_to_symts       r^   rg   zIterationRanges.symt   s>     <F;K;K;MN<4&$,NNdkk** Os   7)rU   strrV   list[sympy.Symbol]rW   dict[sympy.Symbol, sympy.Expr]rX   
sympy.ExprrY   ri   rZ   
SIMDKernelr[   IterationRangesRootreturnNonero   boolro   zsympy.Symbol)ro   r   )__name__
__module____qualname____doc__sympySOnerT   propertyr)   r   rb   rd   rg   __classcell__r]   s   @r^   rN   rN   P   s    . ww{{ % 3	
    " 
0 0   0- +   +r_   rN   c                       e Zd Z	 d		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
 fdZddZddZddZddZ	 	 	 	 ddZddZ		 	 	 	 ddZ
 xZS )rn   c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r| j
                  r|	J || _        || _        |	| _        |
| _	        y )N)rU   rV   rW   rX   rY   rZ   r[   )
rS   rT   indexnodes	pid_cacherb   is_loop
tensor_dimgrid_dimhas_zdim)r\   rU   rX   rY   r   rZ   r   r   r   r   r   r]   s              r^   rT   zIterationRangesRoot.__init__   s     I 	 	
 
=?
 *3 t00X5EFF$  r_   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))rU   rX   ra   s    r^   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGr_   c                b    | j                   j                         D ]  }|j                           y rR   )r   valuescache_clear)r\   nodes     r^   r   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	r_   c                2    t        | j                   d      S )Nr   )r0   rY   ra   s    r^   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788r_   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r6   graphsizevarsstatically_known_equalsrX   r   r   r   r   IterationRangesEntryrY   nextrZ   iter_vars_countrange_tree_nodesrd   rV   appendrW   )r\   rO   rP   exprr   s        r^   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$r_   c                    t         j                  j                  }g }t        |      D ](  }|j	                  | j                  ||             ||z  }* g t        |      S rR   )rx   ry   rz   reversedr   r   )r\   lengthsrO   itervarsrP   s        r^   construct_entriesz%IterationRangesRoot.construct_entries   s]     ''++w' 	'FOODKK89&G	' %(#$$r_   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rR   )r   rd   )r\   r   es      r^   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     N  	 |j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  d        t        j                  j                  g g 		fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                         s, || j                  t        | j                                      g t#              g t#        	      fS c c}w c c}w )z,Figure out vars from this tree used in indexc                    t         j                  j                  j                  | j                  t
        j                        S )N)fallback)r6   r   r   	size_hintrO   r   unbacked_symint_fallbackrJ   s    r^   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>   s/    !''**44		F$C$C 5  r_   keyc                    j                  | j                                j                  | j                         | j                  z  y rR   )r   rd   rP   )r   rO   
index_varssizess    r^   addz/IterationRangesRoot.vars_and_sizes.<locals>.add   s5    dkkm,LL%+Gr_   )free_symbolsr6   rZ   r   getrY   sortrx   ry   rz   r   r   r   rO   r   r   rX   r   )
r\   r   sr   nr   r   rO   r   r   s
          @@@r^   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sO    <A;M;MNa**..q1NN!CqQ188t{{+BCC

 	 	

 ''++
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCD&*%&(:(5/(:::7 OCs   0FF"F"*F"rR   )rU   ri   rX   rl   rY   ri   r   intrZ   rm   r   Optional[dict[str, str]]r   rr   r   Optional[int]r   r   r   rr   ro   rp   ro   ri   ro   rp   rs   )rO   rl   rP   rl   ro   r   )r   list[sympy.Expr]ro   zlist[IterationRangesEntry])r   r   ro   rj   )r   rl   ro   z+tuple[list[sympy.Symbol], list[sympy.Expr]])rt   ru   rv   rT   r   r   r   r   r   r   r   r|   r}   s   @r^   rn   rn      s     /3(!(! (! 	(!
 (! (! ,(! (! "(!  (! (! 
(!TH9 .%'%	#%E;;	4;r_   rn   c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZddZddZd
dZddZddZ	ddZ
 xZS )r   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	rU   rX   rV   rW   rY   rO   rP   rZ   r[   )rS   rT   rX   rV   rW   rY   rZ   r[   parent	functools	lru_cache_codegencodegenr   )r\   rU   rO   rP   r   r   r]   s         r^   rT   zIterationRangesEntry.__init__  s~     	,,'__((==== 	 
	
 0y**40?	r_   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(r   ))rU   rO   rP   r   rW   ra   s    r^   r   zIterationRangesEntry.__repr__  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrr_   c                L    fd| _         d | j                   _        | _        y )Nc                      S rR    )rU   s   r^   r   z/IterationRangesEntry.set_name.<locals>.<lambda>   s    t r_   c                      y rR   r   r   r_   r^   r   z/IterationRangesEntry.set_name.<locals>.<lambda>!      r_   )r   r   rU   )r\   rU   s    `r^   set_namezIterationRangesEntry.set_name  s    ##/ 	r_   c                8    | j                   j                          y rR   )r   r   ra   s    r^   r   z IterationRangesEntry.cache_clear$  s      "r_   c                X    t         j                  j                  |        | j                  S rR   )r6   rZ   codegen_iteration_ranges_entryrU   ra   s    r^   r   zIterationRangesEntry._codegen'  s    	//5yyr_   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr7   r   c              3  P   K   | ]  }t        |t        j                           y wrR   )r   r   SIZE.0r   s     r^   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>4  s       ,56N1dii0,   $&)
isinstancer   rx   Symbolr   r   typeargsIntegerr   lenallr   )r\   precomputed_argsargsymbolss       r^   r   z%IterationRangesEntry.precomputed_args+  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  r_   c                ,    t        | j                        S rR   )hashrU   ra   s    r^   __hash__zIterationRangesEntry.__hash__:  s    DIIr_   c                X    t        |t              sJ | j                  |j                  k(  S rR   )r   r   rU   )r\   others     r^   __eq__zIterationRangesEntry.__eq__=  s&    %!5666yyEJJ&&r_   )rU   ri   rO   rl   rP   rl   r   rl   r   rN   ro   rp   r   )rU   ri   ro   rp   r   )ro   r   ro   r   )r   objectro   rr   )rt   ru   rv   rT   r   r   r   r   r   r   r   r|   r}   s   @r^   r   r     sf      	
    
.s
# 'r_   r   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r^   constant_reprr   B  s9    e	%-		E	;r_   CSEVariableType)bounddefaultc                      e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 d5	 	 	 	 	 	 	 	 	 	 	 d6 fd
Ze	e
ed7d                     Zd8dZe	d9d       Zd:dZ	 	 	 	 	 	 	 	 	 	 	 	 d;dZd<dZd=dZd>dZd:dZd:dZd?dZd7dZd@dZdAdZd9dZdBdZ	 	 	 	 	 	 dCdZ	 	 	 	 	 	 dCdZdDdZdEdZe 	 	 	 	 	 	 dFd       Z!e"e#jH                  jJ                  f	 	 	 	 	 	 	 dGd        Z&	 	 	 	 dHd!Z'e"	 	 	 	 	 	 dId"       Z(dJd#Z)dJd$Z*dKd%Z+	 	 	 	 dBd&Z,dLdMd'Z-dNd(Z.dOd)Z/dPdQd*Z0e1jd                  	 	 	 	 	 	 dRd+       Z3dSd,Z4e d-        Z5d. Z6d/ Z7d0 Z8d1 Z9d2 Z:d3 Z;dTd4Z< xZ=S )Urm   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFrr   allow_block_ptrri   kernel_namec                    |i }t         	           | _        |j                          _        t                _        t                _        |j                         D ci c]/  \  }}|t        j                  j                  j                  |      1 c}} _        g  _        i  _        t!        j"                          _        |j'                          _        ||n j+                          _        ||n j/                          _         j3                          _        d  _        t9        j:                  d       d fd       }| _         j?                  |       y c c}}w )Nc                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rR   )r6   r   r   simplify_with_rangesrW   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treer\   s     r^   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing}  sb    GG$$99%ARSE(( B44UDAB 66u==r_   )r   rl   ) rS   rT   featuresget_mutations	mutationsr,   bodyindexing_coderf   r6   r   r   simplifynumelsr   r   	itertoolscountr   rb   inside_reduction should_use_cooperative_reductioncooperative_reductionshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r   initialize_range_tree)
r\   tilingr   r   override_persistent_reductionoverride_cooperative_reductionrY   valr   r]   s
   `        r^   rT   zSIMDKernel.__init__Y  sK    I !//1"$	+-FLlln
7BvsFAGG$$--c22
 79JL(0 ( 5 5 7 .9 +668 	" -8 *557 	!
 **,(, 
		T	"	> 
#	> "3""9-=
s   "4E c                :    t        d | j                  D              S )Nc              3  2   K   | ]  }t        |        y wrR   )r.   )r   rY   s     r^   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I6&v.I   )sumr  ra   s    r^   num_reduction_dimszSIMDKernel.num_reduction_dims  s     IT[[IIIr_   c                    t         rR   NotImplementedError)r\   dtypes     r^   dtype_to_strzSIMDKernel.dtype_to_str      !!r_   c                T    | j                  | j                  j                               S rR   )r  r   select_index_dtypera   s    r^   index_dtypezSIMDKernel.index_dtype  s       !A!A!CDDr_   c                     yNFr   ra   s    r^   r  zSIMDKernel.want_no_x_dim      r_   c                   t        fdt        D              }| xs | }d	d}g d}	ddg}
|r|
}n
|r|	}n|	|
z   } |||      } ||	t              }g }t        |      D ]s  \  }}t        |      }|j	                  |      }|j	                  |      }||n|}|j                  t        | d|   ||| ||xr | j                   ||dv 
             u |S )
Nc              3  ,   K   | ]  }|v s|  y wrR   r   )r   rY   r  s     r^   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
61AF%
   	c                `    t        fd| D              D ci c]  \  }}||
 c}}S c c}}w )Nc              3  ,   K   | ]  }|v s|  y wrR   r   )r   r  masks     r^   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U3PT32Ur'  )	enumerate)seqr*  idxr  s    `  r^   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s3    )22U#2U)U%S#S  s   *)rJ   rI   rH   rK   rL   r   rH   )r   r   r   r   r   )ro   zdict[Any, int])r   all_prefixesr+  r.   r   r   rn   r  )r\   r   r  rb   r  r  active_prefixesno_r_dimr.  	grid_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr   irY   r   r   r   s       `               r^   construct_range_treesz SIMDKernel.construct_range_trees  s#    % %
!-%
 
 (';|+;	
 $	(K#K#n4K ,KI))\B"?3 	IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F]	& r_   c                    | j                  || j                  | j                  j                         | j                  | j
                        }| j                  j                  |       y rR   )r8  r  r   rb   r  r  r   extend)r\   r   r   s      r^   r  z SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,r_   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )r\   indicess     r^   finalize_indexingzSIMDKernel.finalize_indexing  r   r_   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wr#  )r  store)r\   rU   r   r   priors        r^   store_reductionzSIMDKernel.store_reduction  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yr#  r   ra   s    r^   r	  z+SIMDKernel.should_use_cooperative_reduction  r$  r_   c                     yr#  r   ra   s    r^   r  z*SIMDKernel.should_use_persistent_reduction  r$  r_   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrR   )rW   rf   r   r   s     r^   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *,0%%'*r   )dictr  chainfrom_iterabler   ra   s    r^   rW   zSIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
r_   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrR   )r   r   rF  s     r^   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>  s     Q3td23Qs   !#)r  r   ra   s    r^   triton_tensor_ndimzSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQr_   c                ^    dg| j                         z  }d||<   ddj                  |       dS )Nrp   :[r   ])rL  join)r\   r7  r   s      r^   indexing_size_strzSIMDKernel.indexing_size_str  s9    42244a499U#$A&&r_   c                    dg| j                         z  }| j                  D ]R  }|j                  |j                  r| j                  s)|j
                  j                          d||j                  <   T |S )N1BLOCK)rL  r   r   rb   r  rY   upper)r\   r   r   s      r^   dense_size_listzSIMDKernel.dense_size_list  sv    //11$$ 	GD&$$(=(=,0KK,=,=,?+@)Fdoo&	G r_   c                L    | j                         }ddj                  |       dS )NrO  r   rP  )rW  rQ  r\   r   s     r^   dense_size_strzSIMDKernel.dense_size_str  s)    $$&499U#$A&&r_   c                   t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  j                   |j                  j"                        j%                         i      S )Nr   )r   r   r   r   r   r2   r   r6   r   r   r   r[   r   r   rx   ry   rz   rX   rd   )r\   r   rJ   	tree_node	new_indexs        r^   r   z)SIMDKernel.combine_modular_indexing_pairs	  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
r_   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rR   )r6   r   r   expand_floor_divr   _combine_contiguous_dims)r\   r   r   
expand_resr]  denominators         r^   r   z"SIMDKernel.combine_contiguous_dims  s[     ))::5AA:A%/"I{D99)TJKXX00==r_   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r7   )r   rx   r   r   r   r   r6   r   r   _simplify_loopsr:   r   r2   rG  zip)
r\   r   r   r   r   	new_sizesreindex_prunenew_index_varsr]  s
             r^   r`  z#SIMDKernel._combine_contiguous_dims$  s     eemmU\\:;L //6
Eu:?L%&WW%5%5%E%E7US&
"	7F L	2ud3z7>;R+S&TU	r_   c                      j                   d   j                  xs  j                  t        j                   fd       } |       S )Nc               3     K    j                   j                         s j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)NFT)r   rb   r  codegen_body)r\   should_flushs   r^   ctxz)SIMDKernel.disable_reduction.<locals>.ctx;  sl     ==--/0000 !!#$)D!-%%'(,%%s   AA5A) !A5)	A22A5)r   r   r
  
contextlibcontextmanager)r\   ro  rn  s   ` @r^   disable_reductionzSIMDKernel.disable_reduction8  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ur_   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rR   )r   r   re  r   )r\   r   rP   rangess       r^   
set_rangeszSIMDKernel.set_rangesP  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                $   t        d |D              r| D cg c]  }g  c}g fS t        j                  j                  | D cg c]  }g  c}| D cg c]  }j	                  |       c}t        j                         d
fd}	 	 	 	 	 	 	 	 dd}g }d}|D ];  }	g }
|	D ]  }j                  |d      r|
j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)|dz   t              k  roj                  ||         rZj                  ||         st        |   }t        ||         }|
j                   || |||       ||dz   |                   |
j                  t        j                   |||                   " |j                  |
       > t        d D              sJ d d	|        |fS c c}w c c}w c c}w )Nc              3  8   K   | ]  }t        |      d k(    ywr   N)r   )r   rP   s     r^   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>^  s     6Fs6{a6   c                    j                  |      }j                  |    |      st        t        |    |      | <   |    j	                  |       t              S rR   )r  statically_known_multiple_of	CantSplitr   r   r   )r7  r   
new_ranges	remainingsv	var_counts     r^   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_rangef  sZ    ;;t$D229Q<F#IaL$7IaLqM  &	?"r_   c                     d fd}|S )Nc                     |    z  |    z   S rR   r   )	flat_varsidx1idx2sizes    r^   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getterr  s    io-	$??r_   )r  r   ro   rl   r   )r  r  r  r  s   ``` r^   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combinedo  s    @ Mr_   r   r7   c                6    t         j                  j                  S rR   )rx   ry   Zero)_s    r^   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>}  s    EGGLL r_   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r7   Nr6   r   r   r   r   s     r^   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s*     I!177##--a0A5Is   68zfailed to set ranges  )r7  r   r   rl   ro   r   )r  rl   r  r   r  r   ro   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r6   r   r   r  r  r  r   r   r   statically_known_gtr{  r|  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r}  r~  r  r  s                 @@@@r^   _split_iteration_rangesz"SIMDKernel._split_iteration_rangesW  s]    6g66$*+5B+R//WW:@-AQb-A
-34R[[^4	OO%		# 	#		$'	/2	5	 !## #	9LN$  --dA6"))*@A#c)n49S9Sm,:
 "Q&M $c)n49S9Sm,: !1$s9~5":P:P)M2; ::i6 (%m4E$T9]+CDE"))%!%mU;%ma&7? #)) ++ImT,JK= B "((8G#	9J IyII 	
#I;ay9	
I 000G , .B4s   	H	HHc                   t         j                  j                  }t        |d         dk(  r2|j	                  t        |      t        |d         |z        r|d   |gf}	 | j                  ||       y# t        $ r Y yw xY w)Nr7   r   TF)r6   r   r   r   r   r1   r  r|  )clsr  r   reduction_numelr   s        r^   is_compatiblezSIMDKernel.is_compatible  s     77##wqz?a,,f%gaj)O;
 qzO#45G	''8 		s   A2 2	A>=A>c                >   | j                   D ci c]  }|j                  |j                   }}| j                  s0|D ]+  }t	        |      st
        j                  j                  ||<   - g |j                         }| j                  ||| j                        S c c}w rR   )r   rY   rX   r  r.   rx   ry   rz   r   map_kernel_groups_to_node_sizesru  )r\   r   rtr  rY   r  s         r^   split_and_set_rangeszSIMDKernel.split_and_set_ranges  s     150@0@A""))RXX%AA$$  1&v.%*WW[[F6N1 $6==?#33FGT__UU Bs   Bc           
     F   t        |      t        |      k(  r!t        d t        ||      D              r || S | j                  ||      \  }}g t        j
                  j                   ||       }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywrx  r6   r   r   r  r1   )r   rJ   r  s      r^   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s@      /
1 GG%%mA&6&:;q@/
s   AA)r   r   re  r  r  rH  rI  )	r  r  r   ru  r}  r  r   fnsfns	            r^   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
GV,/
 ,
 w'',/,G,GPW,X)
)LY__22:z3JKL8MN,"H,NN,Ns   7	B BBBc                6    t        |t        j                        S rR   )r   r   TMPr\   r   s     r^   is_indirect_indexingzSIMDKernel.is_indirect_indexing  s    "5$((33r_   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                  j!                               D              S )NFr7   c              3  F   K   | ]  \  }} |       |      k7    y wrR   r   )r   	idx_range
iter_ranger  s      r^   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s,      
%	: Y8J#77
s   !)r  r   r  r   r   r   r   rn   r   rP   r6   r   r   r  anyre  r   )r\   r   index_numelsrd   entryr  s        @r^   is_broadcastedzSIMDKernel.is_broadcasted  s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;;M;M;O)P
 
 	
r_   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        rO  r   rP  )r   listrQ  mapindex_to_strr   rename_indexingr  s     r^   r  zSIMDKernel.index_to_str  sN     eT"tyyT%6%6!>?@BBzz$..u566r_   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrR   )r   r   r   PRECOMPUTED_SIZEr   s     r^   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>  s.      , #1tyy$2G2G&HI,s   46)r   r2   r6   r   r   precomputed_replacementsr   atomsrx   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)r\   r   ar   replacements
simp_indexs         r^   prepare_indexingzSIMDKernel.prepare_indexing  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00r_   c                l   | j                   D cg c]  }|j                  r| j                  s| }}|rut        |      dkD  rgt	        d |D              }dj                  d |d | D              d| d  k(  s"J |d | D cg c]  }|j                   c}       t        |d |       |d | |S c c}w c c}w )Nr7   c              3  8   K   | ]  }|j                   d v   yw)xyzNrY   r   ts     r^   r   z0SIMDKernel.active_range_trees.<locals>.<genexpr>6  s     9aE)9ry   c              3  4   K   | ]  }|j                     y wrR   r  r  s     r^   r   z0SIMDKernel.active_range_trees.<locals>.<genexpr>7  s     ;188;s   zyx)r   rb   r  r   r  rQ  rY   r   )r\   reorderr  treesr  s        r^   active_range_treeszSIMDKernel.active_range_trees1  s    ''
q~~AVAVA
 
 s5zA~9599E77;U6E];;ueVW~M "'-PP M %U6E]3E&5M

Ps   B,B,?B1c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r6   r   r   r   rW   sortedr   ri   r   r   r  r   r2   r   r   )r\   r   symr  pss        r^   r  zSIMDKernel.codegen_indexing=  s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c277$7D))#.3 %%c*224	5 r_   c                    t        d      )NzNYI: codegen_nan_checkr  ra   s    r^   codegen_nan_checkzSIMDKernel.codegen_nan_checkN  s    !":;;r_   c                    t        d      )NzNYI: call_kernelr  )r\   rU   r   s      r^   call_kernelzSIMDKernel.call_kernelQ  s    !"455r_   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr4   logical_andr5   _unwrap)r\   r*  r   r@  	prior_vals        r^   
mask_loadszSIMDKernel.mask_loadsT  sy     
 $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r7   r   )r   rf   r   r2   r   r0   rU   )	r\   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r^   get_strides_of_loadzSIMDKernel.get_strides_of_loadh  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rR   )r   tupler  )r  r   s     r^   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalar  s'    eU#R((%yr_   c           	        g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}| j                  j                         }t        j                  j                  j                  t        | j                  j	                                     }t        |      D ]2  \  }}||vr|j                  d       t        j                  j!                  |      }	t        j                  j                  j                  |	      }
|
|kD  rwt#        t$                  }d}||   D ]M  }t'        |t(        t*        f      r|j-                  d|        |dz  }3|j-                  |j.                         O t        |      |z  }n|
}t        j                  j1                  |      }t3        |      }|j                  ||z  dt5        ||k        z   z         5 t7        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r7   )r   r3   r   inplace_buffersr   python_argdefsr   buf_accessesr6   r   r   r   r1   r  r+  r   	get_numelr   r   r   r   r    r   r   	get_dtyper+   r   r  )r\   nbytesninplace_argsr  	call_argsr  	out_numelr7  r   	arg_numelbuf_sizer<  no_index_dep_countdeprX   r  
dtype_sizes                    r^   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY5579a}}113 GG$$..}T[[=O=O=Q/RS		* 	MFAs ,&a ))#.Iww''11)<H)# %S/+%&"', /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL9	M: 6{r_   c           	     &   t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]F  }t        j                  j                  |      }|s&|j                         }	t        |	j                        dk(  sOt        |	j                  D 
cg c]
  }
|
dk(  s	|
 c}
      dk(  r|t        j                  |	j                        }||}||k7  st        d| dd| d	| z         }t        j!                  |       |D cg c]m  }t        j                  j                  |      rJt        j                  t        j                  j#                  |      j                         j                        ndo }}|D cg c]Z  }t        j                  j                  |      r7t        j                  j#                  |      j                         j                  nd\ }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}|D 
cg c]  }
|
j(                   }}
t        d| d| d| d| d| dz         }t        j!                  |        y t+        d| d      }t        j!                  |       yc c}
w c c}w c c}w c c}w c c}
w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r7   r   N   r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr  r  r6   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider%   logwarning
get_buffergraph_inputsname_to_bufferrU   r$   )r\   r   argdefsr  
_signaturer  uniform_stride_orderarg_namebuflayoutrJ   stride_ordermsgrU   stride_order_list	size_listsource_listargdef_namess                     r^   warn_mix_layoutzSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#! 0	H''((2C^^%F6;;1$6;;9a!q&9:a?!226==A'/+7()\9%01E0FF^_l^<}EFC KK$ %.) ! 7711$7 ++GG..t4??AHH "	")% ) %.	! ! 7711$7 **40;;=BB!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # 5<#<qAFF#<L#<%(nYK|\m[no&ykk]"MNC KK$a0	b 3K=@TU
 	C[ :)!# $=s'   -
K:
8K:
"A2K?AL?AL	
Lc                   t        j                  ||d|      }d| _        t        j                  | j                  j
                  |      }t        j                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )Nr  FT)r4   	reductionr  
index_exprr   r  truedivsubmulr5   r  )	r\   r  r   sum_rnumelmeandxdx2m2s	            r^   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback  s    }}UE5%8 % = =uE{{4( $WWUD!ggb"o]]5%4!!4V"455r_   c                    t        j                  ||d|      }t        j                  ||      }t        j                  |      }t        j                  ||d|      }t	        j
                  ||f      S )Nmaxr  )r4   r  r!  expr5   r  )r\   r  r   vmaxr!  r,  vsums          r^    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallback  s\    }}UE5%8ggeT"ggcl}}UE5#6!!4,//r_   c                    t         rR   r  ra   s    r^   codegen_kernelzSIMDKernel.codegen_kernel  r  r_   c                     y rR   r   ra   s    r^   rm  zSIMDKernel.codegen_body"      r_   c                     y rR   r   )r\   r  s     r^   r   z)SIMDKernel.codegen_iteration_ranges_entry%  r3  r_   )NNN)r  dict[str, sympy.Expr]r   rA   r   r   r  Optional[bool]r  r6  ro   rp   r   )r  ztorch.dtypero   ri   r   rq   )r   r   r  rr   rb   rr   r  r5  r  rr   ro   list[IterationRangesRoot])r   zdict[str, str]ro   rp   )r<  Sequence[sympy.Expr]ro   rp   )rU   ri   r   rl   r   r9   ro   rp   )ro   rk   )r7  r   ro   ri   )ro   z	list[str])r   rl   ro   rl   )r   rl   r   rn   ro   rl   )ro   z'contextlib.AbstractContextManager[None])r   rl   ro   rj   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]ro   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r9  r   r:  r  rl   ro   rr   )r   r:  ro   list[list[sympy.Expr]])r  r8  r   r:  ro   r;  )r   rl   ro   rr   )r   rl   ro   ri   F)r  rr   ro   r7  )r   rl   ro   rl   r   rR   )rU   ri   r   zOptional[IRNode]ro   rp   )r*  zUnion[str, OpsWrapper]r   Union[int, float]ro   zIterator[str])r   rl   ro   rk   )r  r   )>rt   ru   rv   rw   pexprr   __annotations__r   rT   r{   r)   r   r  r  r!  r  r8  r  r=  rA  r	  r  rW   rL  rR  rW  rZ  r   r   r`  rr  ru  staticmethodr  classmethodrx   ry   rz   r  r  r  r  r  r  r  r  r  r  r  rp  rq  r  r  r  r  r  r)  r/  r1  rm  r   r|   r}   s   @r^   rm   rm   O  s    */E&.&&!OT! /38<9=-.%-. %-. ,	-.
 (6-. )7-. 
-.^ J   J" E E4+4 4 	4
 &4 4 
#4l-*
R'
'
$>>':>	>':	(0
 J1$J1/MJ1
J1 J1X 
 ',ggkk	$ 0 $	
 
 ,
V5
V	
V O$O 0O
 
 O O84
,7$1$1 
$1L
"<6 )*)3D)	) )&0  
=~EN
60"r_   rm   c                     e Zd ZU eZded<   d Zd ZeZeZ	d Z
	 	 ddZe	 	 	 	 	 	 dd       Zd dZ	 	 	 	 d!d	Zd
 Zdd	 d"dZd Z	 d#	 	 	 	 	 	 	 	 	 	 	 d$dZd Ze ej.                  d      d%d              Ze	 	 	 	 	 	 d&d       Ze	 	 	 	 	 	 d'd       Ze	 	 	 	 	 	 	 	 d(d       Ze	 	 d)d       Zeej<                  j>                  f	 d*d       Z d Z!d+dZ"d#dZ#d Z$d Z%y),SIMDSchedulingz	type[Any]kernel_typec                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrR   r  r   s     r^   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>-  s*     PQQWW%%..}Q/?@Ps   <>)r  rY  s     r^   group_fnzSIMDScheduling.group_fn,  s    P%PPPr_   c                   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }}|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r,|j                         r|k(  xr |k(  }|s |d||       |S |j                         s|j                         s|k(  r|k(  s|j                         s |d||       y|j                         D ]`  }|j                         r nN|j                         |j                         z  s7|j                  \  }\  }	}
||	k(  r||
k(  rT |d||	||
        y t        ||fd      D ]D  \  }}|j                         st        |j                         t              }|s || d       |c S  | j                  |j                         ||      }| j                  |j                         ||      }| j                  |j                         |j                         z   ||      }t         j"                  j$                  rVd}t'        |      d	kD  r%t'        |      d	kD  r||cxk(  xr |k(  nc }n||k(  }nt'        |      d	kD  r||k(  }|s |d
|||       yy|j                         s|j                         r|dk(  rdk7  sJ |z  k(  rt)        fd|j                         D              s	 |d       yt         j"                  j*                  r\|j                         sLt-        | j                  |j                         |      j/                               |dfdffv }|s |d       |S y|k7  r |d       |k(  S |j                         r|j                         rJ | j1                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s))node1node2z is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r7   c              3  j   K   | ]*  }t         j                  f|j                                , y wrR   )rm   r  
get_ranges)r   r   numel2rnumel2s     r^   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r(   is_split_scanrb   is_template	get_nodesused_buffer_namesget_buffer_namesre  get_template_noder"   select_tilingr   triton tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)r\   rI  rJ  r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   	node_nameis_triton_templatetiling1tiling2tiling3condis_reduction_tiling_validrM  rN  s                      @@r^   rP  zSIMDScheduling.can_fuse/  sO    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G &%!!#E,>,>,@f$G);((*O ! !& 1 )++-!  $557%:P:P:RR$59ZZ22Iz &) 3:8M \ & ) ' * $)#)& !$UEN4F G 	.9==? *4++-/C*& .yk)EFG--	. (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/05**5??+<fELLN1  !,1- 5:;4412V##!!#E,>,>,@@@''u55r_   c           
        g t        t        j                            t        t                  t        t                  d fd}fd}fd}fd}t        j
                  fd       }fd}	|D ]  }
|
v rj                  |
        ||
      r? |	|
      r |       5  	 d d d        r ||
      sxs t              nd  ||
       ` ||
      r" |       5  j                  |
       d d d        t        d d d	|
j                  d
           S # 1 sw Y   |xY w# 1 sw Y   xY w)Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S Nr7   r  r   r  
node_numelnode_rnumelrX   r$  s       r^   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sH    +,77(A(
K%'AK6,A efn,A1Ar_   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S rk  rl  rm  s       r^   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s4    +,77(A(
K&K;!+;K!Kr_   c                \    | j                   j                  D ]  }|j                  v s y y)NTF)read_writesreadsrU   )r   readcurrent_loop_buffer_usages     r^   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s1    ++  99 99  r_   c                   j                  |        j                  |        j                  | j                  j                  D cg c]  }|j
                   c}       | j                         rt        | t        j                        rrt        | j                  t        j                        rNt        | j                  j                  t        j                        s j                  | j                                y j                  | j                  j                   D cg c]  }|j
                   c}       y c c}w c c}w rR   )r   r   updatert  ru  rU   rb   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   rJ   rw  donenode_schedulenot_ready_yet_nodess     r^   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-Raff-RS
  q)"9"9:qvvr'8'89"166;;8#''

5)00!--BVBV1WQ!&&1WX .S 2Xs   D; E c               3  L  K   rd   t         u rj                          nj                  t               r1j	                  t               j	                  dz   t                d d  j                  t                j                           j                          y w)Nrk  r7   )r?   popr   r>   insertclear)rw  maybe_split_indexr  r  s   r^   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B!B$c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr7   Frk  )	ancestorsr   r?   r>   rr   )r   r  r  r$  s     r^   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sN    {&7 b!O5E#F*   +,,r_   zunexpected group: (r   z) != r7   )r   r   r&   ri   rp  rq  r   r   r   r  r  )r\   r   rX   r$  rp  rr  rx  r  r  r  r   rw  r  r  r  r  s     ``       @@@@@r^   generate_node_schedulez%SIMDScheduling.generate_node_schedule  sa   #%)5568 )o/$.sO$5!+/		L		Y" 
	"	"	. 
#	.	-  	Dt|HHTN &6t]K35  -5QRV5W(9(OS=O% )-%%d+'-/1 /!((./ / *)%6(%

1O -	4 ' / /s   6D34D?3D<	?E	c                    |j                         }t        |d       j                  \  }\  }}| j                  |||      }t        j                  d|       | j                  t        |||            S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                4    t        | j                               S rR   r   rb   r   s    r^   r   z-SIMDScheduling.codegen_node.<locals>.<lambda>#  s    c!..:J6K r_   r   zSchedule:
 %s)rS  r+  r  r  schedule_logdebugcodegen_node_schedulerA   )r\   r   r   r  rX   r$  r  s          r^   codegen_nodezSIMDScheduling.codegen_node  sp     04~~/? ,KLRR?E633E5&I+];))}eV<
 	
r_   c                   t        j                  t         j                        j                  }t	        |       sy|D cg c]0  }|j                         r|j                         j                         2 }}t        d |D              syt        j                  j                  j                  | |       |D ],  }t        j                  j                  j                  ||       . yc c}w )NFc              3  2   K   | ]  }t        |        y wrR   )r*   )r   r  s     r^   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>>  s     FD)$/Fr  T)torchiinfoint32r+  r*   has_tensor_outputr  storage_sizer   r6   r   r   	guard_leq)rX   buffersint_maxr  	buf_sizesr  s         r^   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing,  s    
 ++ekk*..%e, 
$$& NN))+
	 
 FIFF 	
""5'2 	6DGG&&tW5	6
s   5C$c                   |j                   }| j                  ||j                  |j                        }| j	                  ||gd|i      }|D ]  }| j                  ||        t        j                  |       |D ]  }t        j                  |      5  |j                         }d d d        | j                  ||      }t        j                  j                  rt        ||       t         j#                  d|       ||_        t'        |      |_         ~t)        |      dkD  rt        |      }n|\  }t        j                  |      5  |j+                         D ]  }	|	j-                           	 d d d        | j/                  |       |j1                  |j$                         t        j2                  r|j5                          t        j6                  r|j7                  |d   j$                         t        j8                  xj:                  |j:                  z  c_        t        j8                  xj<                  |j<                  z  c_        t        j8                  j>                  j@                  rt        jB                  r|d   jD                  jG                         }
|j+                         D ]  }	|	jI                         }||
vr|	jJ                  J |	jJ                  jM                         }|CtN        d   dxx   dz  cc<   t        j8                  j>                  jQ                  d|jR                  d| d	        | jU                          y # 1 sw Y   xY w# 1 sw Y   xY w)
Nr   z+Generating kernel code with kernel_name: %sr7   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )+r  rW  rX   r  create_kernel_choices!codegen_node_schedule_with_kernelr=   merge_workspaces_inplacer6   set_kernel_handlerr1  define_kernelr   traceenabledr/   r  r  r   r   r   scheduler_nodesmark_runcodegen_commentr  nan_assertsr  r  r   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinerU   free_buffers_in_scheduler)r\   kernel_featuresr  r  kernelsrZ   src_coder   final_kernelr   	live_outsrU   origin_nodes                r^   r  z$SIMDScheduling.codegen_node_scheduleH  s   '55##?00/2Q2Q
 ,,fX
O'D
  	JF22=&I	J,,W5 	3F%%f- 3!0023,,X}fMK||##7! IIC[Q!,F(2F	3  w<!&w/L%O\!!,/ 	 '779   	  	]+  !9!9:**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779 
}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO
 	&&(k3 3&	  	 s   M&MM	M&c                (     | j                   |i |gS rR   )rD  )r\   r  kernel_argskernel_kwargss       r^   r  z$SIMDScheduling.create_kernel_choices  s)     D
 	
r_   c           	     <   |5  t        j                         }i }|D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j                                       |j!                  |j#                                |D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          Dt%        |j                         |j                  |j                               }|j'                  |        	 d d d        y # 1 sw Y   y xY wrR   )rp  	ExitStackr>   enter_contextrr  r?   closedecide_inplace_updater  rL  rz  rG  fromkeys_bodyindexing_from_argsr   r=  keysr#   r   )r\   r  rZ   stackall_indexingr   r   s          r^   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  sS    	-((*EL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 & 	-++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL,	--	- 	- 	-s   FFFFonly_gen_src_codec               f
   |j                   \  }\  }}|dk(  sJ |j                  j                  |j                        \  }}	i }
|j                         }g }|D ]  }|j	                         }|j                  |       ||z  s*t        |      dk(  sJ ||
t        t        |            <   |j                  j                  t        t        |                   g } t        |      dk(  sJ |5  |s|g|D ]  }|j                            |	       }|j                  d      5  |D ]0  }|j                  |j                  |j                                      2 |j                   j#                  t%                      ddd       |j&                  j)                         D ]+  \  }}d| d}|
j+                  |j-                         g       x}s0t/        d |D              }t1        j2                  d|       5  |j                  |      5  |D ]  }t        |j	                               dk(  r<t        |      dk(  r.t5        |      r#|xj6                  |j	                         z  c_        |j                  |j                  |j                                       |j                   j#                  t%                      ddd       ddd       . 	 ddd       t9        t:              s$|j=                  d	       |j=                  d
d       t?        j@                  |      5  |j&                  jC                         D ]  }d| d}|j=                  |d        |j                  d      5  t9        |t:              r|}n|j=                  d       |jD                  }ddd       g |||}t0        jF                  rH|jI                         dz  }|jK                          d d|jM                  |      jO                          }|rcddd       S | jQ                  ||      }t0        jR                  jT                  rtW        ||       ddd       | jY                         |j[                  |j                         t>        j\                  xj^                  |j^                  z  c_/        t>        j\                  xj`                  |j`                  z  c_0        | jc                          y# 1 sw Y   nxY w# 1 sw Y   =xY w# 1 sw Y   nxY w# 1 sw Y   CxY w# 1 sw Y   xY w# 1 sw Y   xY w)z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r7   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  <   K   | ]  }|j                           y wrR   )can_codegen_without_upcasts)r   p_ns     r^   r   z2SIMDScheduling.codegen_template.<locals>.<genexpr>  s      5>A7795   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )2r  r   make_kernel_renderrT  rU  r   r   r   iterprologue_fused_inputsr   r  set_subgraph_bodyr   r  rL  cse
invalidater   named_input_nodesrf   r   r  r   r   patchr   #prologue_fused_inputs_preserve_zeror   ri   finalize_hookr6   r  r  codebenchmark_kernelr  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r/   r  r  r   r  r  r  )r\   template_nodeepilogue_nodesprologue_nodesr  r  _numelr$  rZ   renderbuf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder  r  num_gbr   s                             r^   codegen_templatezSIMDScheduling.codegen_template  s    ,11FF{{&++>>}?Q?QR%'"&88:& 	$H--/E!!(+~%5zQ&@N*4U+<=,,00d5k1BC!#	$ >"a''' ,	@$ +<^< $DMMO$ "8L))*:; 4* QDLL!<!<T__=N!OPQ

%%jl34
 '-&>&>&D&D&F @"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W @ $55mD @1? "$'(F(F(H$IQ$N(+N(;q(@'CM'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!"" #JJ11*,?!@@ @@,	@\ ,,&&~6&&{5&A !!&) 	T %66;;= H
".zl! <**=*GH ))*:; 1lC0+H ../?@+00H1 NnMmMnMM&&99;cA::<=Rj66v>GGIJL  !1	T 	T4 ,,X}fMK||##7{S;	T> 	]+;(:(:;	6#9#99	""f&?&??"&&(_4 4&@ @@ @3,	@ ,	@t1 1	T 	Ts   ,5T!AS&:AT)T5T B1S3	8T  TA
T''1TA+T'9T'&S0	+T3S=8T  T
TTT$	T''T0c                    t         j                  j                  j                  t         j                  j                  j                                y rR   )r6   r   r  r  
device_opssynchronizera   s    r^   codegen_synczSIMDScheduling.codegen_sync-  s-    	&&qww'9'9'E'E'GHr_   c           
        ddl m} |D cg c]  }|j                          }}i i }
}	t        ||      D ]u  \  }}t	        |d       j
                  \  }\  }}| j                  |||      }| j                  |||      }||||f|
|<   |j                  |t        |||      |       |	|<   w |j                  || ||	|
      }t        j                  dt        |      |D cg c]  }t        |       c}       g }|D ]>  }|D cg c]  }|j                          }} |||      }t        ||      D ]  \  }}| j                  |
|   d	   |j                  |	|                |	|   }|
|   d	   }|sIt!        j"                  |      5  t%        j&                  |      D ]  }|j)                           	 d d d        t         j*                  xj,                  |j,                  z  c_        t         j*                  xj.                  |j.                  z  c_         |j1                         }|j3                  |||f       A |S c c}w c c}w c c}w # 1 sw Y   xY w)
Nr7   )ComboKernelc                4    t        | j                               S rR   r  r   s    r^   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>=      #ann>N:O r_   r   )r   optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelr  rS  re  r+  r  r  rW  create_triton_kernelrA   horizontal_partitionr  r  r   r  create_sub_kernelr6   r  r@   
only_nodesr  r   r  r  r1  r   )r\   subkernel_nodescustom_part_algorithmr  r  r  r  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  rX   r$  r  r  
partitionspkernel_code_list
node_grouprZ   	subkernelr  s                            r^   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code0  s    	59HIDNN,II+-r(_.>? 		IB!$U0O!P!V!VAv 77ufMM''ufEF$165&$Hb! + @ @+M5&I"-o !A !M"		 !55!"2$+ 6 

 			? '(SV(	

 $ 	DJ=GHT 0HH  /'F
 !-=> K	E66%b)!,,,]2->? *"-	 1" 5a 8(--i8 ,$6$A$A-$P ,D MMO,, ''9+D+DD'**i.J.JJ*K ,,.H##Xvz$BC-	D.  c J. )  I, ,s   I II+IIc                   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]l  \  }}}	| j                  ||g|      }
| j                  |g       t        j                  d|
       |j                  t        j                  j                  |
       n | j                          y )Nr7   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  r  r  r  r  r  r6   r   r  r  )r\   combo_kernel_noder  r  r  r  r  r  rZ   r  r   s              r^   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernelm  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 	BHfa,,X8I7JFSK  "3!45II:KHqww33[A		B 	&&(r_       c           
        
 dk(  }d 
fd}|j                         \  }
t        |      dk  rt        
      dk  rg S |j                         \  }
 |||r|n
|j                  |            }|D cg c]?  }t         j	                  |j
                  |      |j                  |j                        A }	}|	S c c}w )Nr7   c                d   t        |j                        t        |      k(  sJ d|j                  d|       |j                  |j                  g}t	        d t
        j                  j                  |      D              sJ t
        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}t        |j                  D cg c]  }|j                   c}      }dd}t        j!                   ||      g|       dd      g}|D ]  }t        j                  j"                  j%                  |j&                  |j                        }	t        |	      t        |      k(  sJ 	 |	j'                  d      dz   }
|
t        |      k(  rt	        d	 |	|
d
 D              r	  ||d
|
        |||
d
       f}t        j                  j"                  j+                  t-        d t/        ||	      D                    }|j                  |v r|dz  }t        j1                  |d         r|dz  }t        j1                  |d         r|dz  }t        j                  j"                  j+                  |t-        t        j                  |            z
        dk\  s|j3                  t        j!                   ||d
|
        |||
d
       g      ||j                                |S c c}w c c}w # t(        $ r Y w xY w)zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  H   K   | ]  }t        |t        t        f        y wrR   )r   r   r   )r   r  s     r^   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s$       3G 45s    "c                f    t         j                  j                  j                  t	        |             S rR   r  )rt  s    r^   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges  s"    ww''00v1FGGr_   noner   )r  rU   scorer7   c              3  &   K   | ]	  }|d k(    ywrx  r   r   s     r^   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s     ;a16;s   Nc              3  2   K   | ]  \  }}|d k7  s|  ywrx  r   )r   r  r
  s      r^   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s       "!-vST"s   r   r  r$  rU   )rt  r8  ro   rl   )r   
range_varsru  r  r   r  rH  rI  rU   r6   r   r  r   r   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r1   re  is_good_sizer   )is_pointwisert  rwdep_sourcesr  depswrite_namesr"  tilingsr  splittiled_groupsr$  r  r  reduction_rangess                r^   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_ranges  s    r}}%V4S8H	&6SS4 88RYY/K $??88E    %??88E88177#:#::sI. D  %"))%D3chh%DEKH
  44(01<  G  4''**77		2==Q7|s6{222
#MM!,q0EF+ ;756?;; ! < $F6EN3#F56N3  ((22! "14VW1E" 
 88{*QJE"//Q@QJE"//Q@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F56N$C!" !0$ #(!$
Q4l N[ &E: " s$   $?L8L"L"=L""	L/.L/r'  )r.  rr   ro   list[CandidateTiling])rL  r   "pointwise_or_reduction_read_writesr)  complete_partial_tilingr  r$  rU   )r  r   rX   r  r.  r7  pointwise_rangespartial_tilingsr  full_tilingsr6  s   `  `      @r^   candidate_tilingsz SIMDScheduling.candidate_tilings  s     '!+\	| .2__->** A%#.>*?1*DI .2__->**% ,2B33LA
 *	
  22MM5/ ll[[	
 	
 	
s   6AB>c                    g dt        |       d }ddgdt        |       }t        g t        ||      t        ||            S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rH   rI   rJ   NrK   rL   )r   r   re  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r^   create_tilingzSIMDScheduling.create_tiling  sY     &s9~o&78#U^,Cc2B.CDVc+y)VC0BDT,UV
 	
r_   c                >    | j                  |r|ng |s|      S g       S rR   )rD  )r  r  r.  s      r^   r*  z$SIMDScheduling.create_partial_tiling  s0       "F&F
 	
,.
 	
r_   c                    t        |j                               }d|v }||z  }|t        |      z  g}|r||fn||f} | j                  | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rJ   )r  r   r1   rD  )	r  r  rX   r  splitsr.  total_numelmissing_tilingtiling_argss	            r^   r:  z&SIMDScheduling.complete_partial_tiling  sf     fmmo&f}o-%f(==> )5V^$>6:R 	 !s  +..r_   c           
        |dk(  }t        t        t        t        j                  f             }t        j                  |      D ]  }t        |t        j                        s|j                         }|st        |d         dk(  rC||rdnd   }|g}	|j                  j                         D 
cg c],  }
t        |
t              rt        |
j                        dkD  r|
. }}
|D ]U  }
g |
j                  j!                         }t        j"                  j$                  }t&        j(                  j*                  }t-        |      D ]!  \  }\  }}||z  }|j/                  ||      s! n |j1                  ||      sdz   }|r|d| n||d }g }|D ]  \  }}t3        j4                  |
j6                  |      }t9        d|j;                  t<              |j;                  t>              z   t        |            }t3        j@                  ||||      }||d   n|g}|jC                  |        |	jE                  |       X |	D ]  }t9        dt        |      tF        jH                  jJ                  z
        }|dz   }tM        |d|       }|ftO        ||d       z   }|jQ                  | jS                  | jU                  ||      ||               tW        |t        d      }|S c c}
w )z
        Creates N-dimensional tiling candidiates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r7   r   Nr   T)r   reverse),r   rG  ri   rx   Exprr?   filterr   r   r{  rL  r   rt  reads_and_writesr   rt  rf   ry   rz   r6   r   r   r+  statically_known_geqr   r8   get_subexpr_involving_symbolr   r+  r  r   r   match_mod_div_block_exprr:  r   r   rX  	max_tilesr1   r  r   r:  r*  r  )r  r  pointwise_numelr  r.  r3  r   node_rangesranges_to_tilenode_tilingsr  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxvarrX   reduction_start_idxrW   index_tilingr   num_dimsmatch_resultdimsnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                  r^   get_nd_tilingszSIMDScheduling.get_nd_tilings+  s*    '!+T#uzz/235#**=9 W	DdI$;$;< //+KCA$71$< )lBN*+L  ++<<>c9-#cjj/A2E K 
 # 12 "73::#3#3#5!6',ww{{$77++7@7P 3%|U(E1(44,o   77(/  '8!&;# $ ##7$78'(;(<=   "", .JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-%.( ##L1c12h  , #&q#k*:V]]=T=T*T#U %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''UW	v  
 as   .1K!c                b   dk(  }| j                  |gg      }|st        j                  j                  rt        j                  j                  dk  rt
        j                  t        j                  k  rt        j                  |      D ]g  }t        j                  j                  rt        | j                  ||            dkD  s>t
        j                  t        j                  d              |S  |S t!        t"                  }t%        j&                         }t        j                  |      D ]g  }| j                  ||      D ]O  }	|	j(                  |v r|	j(                  |j+                  |	j(                         ||	xx   |	j,                  z  cc<   Q i |j/                         D 	
cg c]  \  }	}
|	j0                   }}	}
t        j                  j                  dk\  r?|r=	 	 	 	 	 	 dd}t3        dt        |            D ]  } ||d   ||         }||g|z   } n t        |      dkD  rt
        j                  d|       t        j                  j4                  r| j7                  ||      |z   }|D ]-  t9        t:              sJ t=        fd|D              s+c S  |S c c}
}	w )	z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r7   r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                   | d   | j                  dd      }}|d   |j                  dd      }}t        j                  j                  j	                  ||z
        dk(  ry t        j                  j                  j	                  ||z
        dk  r||f||fc\  }}\  }}t        j                  j                  j	                  ||z
        dkD  sJ t        j                  j                  j                  ||      sy |t        ||      || d   d}|S )NrJ   rI   r7   r   rK   )rH   rI   rJ   rK   )r   r6   r   r   r   r{  r   )tiling0rd  a0a1b0b1
new_tilings          r^   convert_tiling_to_3dz:SIMDScheduling.select_tiling.<locals>.convert_tiling_to_3d  s    !w{{3':B w{{3':B77##--b2g6!;77##--b2g6:*,bB8&HRhr2ww''11"r':Q>>>ww''DDRL !"b)"5>	
 "!r_   zpossibly bad tiling: %sc              3     K   | ]R  }t        |t        j                        r6t        j	                  j                         |j                                 T yw))r  N)r   r   r{  rm   r  r   rL  )r   r   r  r  s     r^   r   z/SIMDScheduling.select_tiling.<locals>.<genexpr>  sO       dI$;$;<	 ((MMOT__%6 ) s   AA)rk  r5  rd  r5  ro   zOptional[dict[str, sympy.Expr]])rD  r   rX  tile_reductionsrS  perf_hint_loglevelloggingWARNINGr?   rN  r   r>  infotextwrapdedentr   ri   collectionsr   rU   r   r$  most_commonr  rangeprefer_nd_tilingrh  r   rG  r   )r  r  rX   r  r.  default_tilingr   
seen_namescandidate_tilescandidate_tilingr$  rg  rq  r7  new_3d_tilingr  s      `           @r^   rW  zSIMDScheduling.select_tiling  s    '!+ **E7_4EFV]]%B%B]]$$)""goo5+22=A D"MM99 5 5dE? STWXX%**$OO!$ !! "!_&
4?4G4G4I#**=9 	LD$'$9$9$$W L #((J6%**6NN#3#8#89 015E5K5KK1L	L ,;+F+F+H7
' % ##7
 7

 ==""a'L"."9N"0"0 1c.12  4"1%~a'8! !,&3_~%EN ~"8.I ==))""=%I ! 
 % 		Ffd+++  *	  		 C7
s   .J+c                     y rR   r   ra   s    r^   flushzSIMDScheduling.flush  r3  r_   c                     yr#  r   ra   s    r^   ready_to_flushzSIMDScheduling.ready_to_flush  r$  r_   c                   t        d |D              st        |d       j                  \  }\  }}| j                  |||      }| j	                  |||      }| j                  |t        |||            }| j                  ||       t        j                  d|      5  t        j                  |      5  |j                         }	d d d        d d d        nL|d   j                  |      \  }
}}t        j                  d|      5  | j                  |||
d      }	d d d        	j                  t!        t"        j$                        d	      }	|	S # 1 sw Y   xY w# 1 sw Y   @xY w# 1 sw Y   LxY w)
Nc              3  <   K   | ]  }|j                           y wrR   )rR  )r   r   s     r^   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>  s     2q1==?2r  c                4    t        | j                               S rR   r  r   s    r^   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>  r  r_   r   )r   r  r   Tr  triton_)r  r+  r  r  rW  rD  rA   r  r   r  r6   r  r1  get_prologue_template_epiloguer  replaceri   r-   KERNEL_NAME)r\   r   r  r  rX   r$  r  r  rZ   r  r  templateepilogues                r^   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodes  ss   2E22!$U0O!P!V!VAv 77ufMM''ufEF%%+M5&I & F 22=&I/1AB3$$V,3 "002	3 3 3 ,18+R+R,(Hh 02BC 00&*	 1  ##C(?(?$@)L%3 3 3 3 s0   E3EEE$E	EE!$E-c                     y rR   r   )r\   r  s     r^   r  zSIMDScheduling.codegen_comment4  r3  r_   c                    t         rR   r  )r\   r  r  rZ   s       r^   r  zSIMDScheduling.define_kernel7  r  r_   N)r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rX   rl   r  z<Iterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]ro   rr   )r  rA   )r  rA   ro   zlist[SIMDKernel])ro   Optional[str]r<  )r  zlist[BaseSchedulerNode]r  rr   r  rr   r  rr   r  rr   ro   zlist[tuple[str, Any, Any]])ro   r8  )r@  r8  rA  r8  ro   r5  )r  r8  r.  rr   ro   r5  )r  r5  rX   rl   r  rl   ro   r5  )ro   z"list[dict[str, tuple[sympy.Expr]]])ro   r5  rq   )&rt   ru   rv   rm   rD  r?  rG  rP  can_fuse_verticalr[  r  r  r@  r  r  r  r  r  r  r  r  rA  r   r   r>  rD  r*  r:  rh  rx   ry   rz   rW  r  r  r  r  r  r   r_   r^   rC  rC  )  s   'K'QF6P !"^@
P
$ M 
 6A)F
1
	
 -F SXt	tlI #(; 0;   $;  	; 
 ;   ;  
$; z)( Yy  yv 

,

@T

	

 

 
$
 
 
	
 
 /%/ / $	/
 
/ /( o
 
,o ob 3877;;p	p pd<"r_   rC  T)frozenc                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r)  r5  r  r   r$  Nr  rU   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   r  )r   s    r^   r-  zCandidateTiling.is_good_sizeA  s5     GG&&q)Bw(AFaK(r_   )rt   ru   rv   r?  rU   r@  r-  r   r_   r^   r)  r)  ;  s)    !!JD-) )r_   r)  c                      e Zd Zy)r|  N)rt   ru   rv   r   r_   r^   r|  r|  H  s    r_   r|  )r   r=  ro   ri   )v
__future__r   r{  rp  dataclassesr   r  rv  r   r  ry  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   rx   r  torch._loggingtorch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr   r  r   r   r   analyze_preserves_zero_maskr   	codecacher   dependenciesr   r   r    r!   r"   optimize_indexingr#   runtime.runtime_utilsr$   r%   r&   r'   r(   utilsr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   virtualizedr4   r5   r6   block_analysisr8   commonr9   r:   r;   r<   multi_kernelr=   simd_kernel_featuresr>   r?   r@   rA   collections.abcrB   rC   rD   	getLoggerrt   r  _logginggetArtifactLoggerrt  r  
fusion_logdoprintr>  r/  	dataclassrN   rn   r   r   r   rm   rC  r)  	Exceptionr|  r   r_   r^   <module>r     s   "           X X X %    9 / L L  & $ $ F ! 6 6 - A ; D D    - , / P P %  << g!00<H~~//*E^^--hA
 	78 5+ 5+ 5+px;/ x;v;'? ;'| +;TW('/*B WtO"^ O"d  d#	) 	) $	)		 	r_   