
    Vh^                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ ddl m!Z! dd	l"m#Z#m$Z$ d
dl%m&Z& d
dl'm(Z( d
dl)m*Z* d
dl+m,Z, d
dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 d
dl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d
dlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZM  G d deN      ZOerd dlPmQZQmRZRmSZS eZT e	j                  eV      ZWdYdZX	 	 	 	 	 	 	 	 dZdZYd ZZd Z[ G d deJ      Z\ G d d      Z] G d d      Z^d Z_g Z`d ead!<   d" Zbd# Zc G d$ d%e\      Zdd[d&Ze	 	 	 d\	 	 	 d]d'Zfd[d(Zgdddd)d*Zhd^d+Zid_d,Zjd- Zk	 	 	 	 	 d`	 dad.Zldbd/Zm	 	 	 dc	 	 	 	 	 ddd0Znded1Zodfd2Zp	 	 	 	 dgd3Zq	 	 	 	 	 	 dhd4Zr	 	 	 	 did5Zsd6 Zt	 	 djd7Zu	 	 	 	 did8Zv	 	 	 	 did9Zwdkd:Zxdld;Zydmd<Zzdnd=Z{d> Z|	 d\d?Z}dkd@Z~ej                   G dA dB             Z G dC dDe      Z G dE dFe      Z G dG dHe      Z G dI dJe      Z G dK dLe      Z G dM dNe      Z G dO dPe      Z G dQ dRe      Z G dS dTe      Z G dU dVe      Z G dW dXe      Zy)o    )annotationsN)
namedtuple)AnyCallableLiteralOptionalTYPE_CHECKINGUnion)compute_required_storage_length)
OrderedSet   )TritonBundler)prefix_is_reductiontriton_version_uses_attrs_dict   )triton_helpers)AutotuneCache)benchmarker)CoordescTuner)_NUM_THREADS_PER_WARPAutotuneHintDevicePropertiesHeuristicTypeReductionHintTileHintTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)ceildivconditional_productcreate_bandwidth_info_strdynamo_timedget_first_attrget_max_y_gridget_num_bytesnext_power_of_2triton_cache_dirtriton_config_to_hashabletriton_hash_to_path_keyvalidate_triton_config)
	ASTSourceautograd_profilercc_warp_sizeCompiledKernelConfig	GPUTargetKernelInterfaceOutOfResources
PTXASErrortritonc                      e Zd Zy)NoTritonConfigsErrorN)__name__
__module____qualname__     Y/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.pyr5   r5   G   s    r:   r5   )	ContainerHashableSequencec           	     x    t        | j                         D cg c]  \  }}t        |      s| c}} S c c}}w N)r   itemsr   numelsprefixnumels      r;   get_total_reduction_numelrF   T   s2    %+\\^	SMFE7J67R%	S 	Ss   6
6
c                2   g }| D ]  }|t         j                  k(  st              dk(  r
|dz  ddff}nDt              dk(  r|dz  ddfd|dz  dff}n%t              dk(  r|dz  ddfd|dz  dfdd|dz  ff}|j                  fdD                |S )a  
    AutotuneHints can be attached to the metadata of triton kernels for providing
    suggestions about what to try for autotuning. One reason to do this is if there are
    some configs that are only useful in specific scenarios, in which case we can avoid
    wasting compile time on autotuning unless we know we are in one of those scenarios.

    Based on those hints, this function will generate a list of additional autotuning
    configs to try.
    r      Nr      c              3  l   K   | ]+  }t        g|d j                  rj                  ndi - yw)num_elements_per_warp    N)triton_config	warp_size).0xyzdevice_props
size_hintss     r;   	<genexpr>z,autotune_hints_to_configs.<locals>.<genexpr>w   sE      	   3?2H2H..b		s   14)r   ONE_ELEMENT_PER_THREADlenextend)hintsrR   
block_sizerQ   configshintxyz_optionss    ` `   r;   autotune_hints_to_configsr\   Z   s      G <666:!# *at<>ZA% *aD9AzQPT;UVZA%1_a+
a+:?+
 NN 	 '	 	. Nr:   c                L    | j                  d      ry| j                  dd       S )N$are_deterministic_algorithms_enabledTautotune_pointwiseget)inductor_metas    r;   disable_pointwise_autotuningrc      s-     ?@  !5t<<<r:   c           	     <   g }i }| D ]D  }t        |t        t        f      r|j                  t	        |             4|j                  d       F |j                         D ]&  \  }}	t        t        t        f      r|	||<   "|	||<   ( t               s1|j                  j                  j                         D ]
  \  }}	|	||<    |j                  j                  |d<   |j                  j                  |d<   g |}
|
j                  d |j                         D               dj                  |
      }
t        j                  j                  t         j"                  d         }t%        | dd      5 }|j'                  | d	|
 d	|d
       d d d        y # 1 sw Y   y xY w)NT	num_warps
num_stagesc              3  0   K   | ]  \  }}| d |   yw=Nr9   rO   kvs      r;   rS   z&_dump_launch_params.<locals>.<genexpr>   s     ?41aqc1#J?   , r   z.launch_paramsaz | 
)
isinstanceintboolappendstrrA   r   configkwargsrf   rg   rV   joinospathabspathsysargvopenwrite)argsrx   launcherkernel_namegrid	call_argscall_kwargsargrl   rm   args_strabs_pathfs                r;   _dump_launch_paramsr      sv   IK "cC;'SX&S!	"
  1cC;'KNKN	
 *+OO**002 	DAqKN	'88K ( : :K|HOO?;+<+<+>??yy"Hwwsxx{+H	
.)3	/ <1	;-s8*Cxr:;< < <s   .FFc                       e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 d fdZ	 	 d	 ddZd Zd Zd Zd Z	d d	Z
d!d
Zd Zd"dZd ZdddZd Zd Zd Z	 	 	 	 d#dZd$dZd Zd Zd Zd ZdddZ	 	 	 	 	 	 d%dZ xZS )&CachingAutotunera	  
    Simplified version of Triton autotuner that has no invalidation
    key and caches the best config to disk to improve cold start times.
    Unlike the main triton Autotuner, this version can precompile all
    configs, and does not rely on the Triton JIT.
    Fc                   t         |           t        |      dkD  sJ d       |D ]  }t        |        || _        |d   | _        i || j
                  j                  | j
                  j                  d| _        |	i n|	| _	        || _
        || _        |g n|| _        || _        || _        || _        |
| _        d| _        t$        j'                  t(        j*                        rdt$        j-                  dt        | j                        | j                  j.                         | j                  D ]  }t$        j-                  |        g | _        g | _        t5        j6                         | _        t;        j<                  d      6t?        | j                  jA                  dd            t:        jB                  d<   t$        j-                  dt:        jB                  d          || _"        tG        d| j                  j.                  || j                  	      | _$        || _%        d
| _&        | jJ                  Tt:        jN                  jQ                  | jJ                        }d|v r't:        jN                  jS                  |      d   | _&        d| _*        d| _+        t:        jB                  jA                  dd      dk(  | _,        t:        jB                  jA                  dd      dk(  | _-        y )Nr   z2Non-empty TritonConfig list required for compilingdevice)r   device_typeFz'CachingAutotuner gets %d configs for %sTRITON_CACHE_DIRzTriton cache dir: %s)is_mmnamerR   rb    z.py TORCHINDUCTOR_DUMP_LAUNCH_PARAMS01TRITON_INTERPRET).super__init__rU   r)   fnrQ   indextypetriton_metarb   save_cache_hookmutated_arg_namesreset_to_zero_arg_namesoptimize_memrY   heuristic_typecustom_kernelcuda_kernel_savedlogisEnabledForloggingDEBUGdebugr6   compile_results	launchers	threadingLocklockrz   getenvr&   ra   environrR   r   coordesc_tunerfilenamekernel_hashr{   basenamesplitextprecompile_time_taken_nsautotune_time_taken_nsdump_launch_paramstriton_interpret)selfr   r   rY   r   r   r   r   rR   rb   r   r   r   cfgc	base_name	__class__s                   r;   r   zCachingAutotuner.__init__   s    	7|aU!UU 	(C"3'	( .9(.C

''--,,11

 $1#8Rm.!2)1B7N 	$ ),*!&GMM*II9DLL!  
 \\ 		! ;=-/NN$	99'(0-=  $$Xq1.BJJ)* 			("**5G*HI$+!!!,,	
 ! !# ==$((7I	!#%77#3#3I#>q#A ()%&'# JJNN=sCsJ 	 !#

/A3 G3 Nr:   c                    |r| j                          y | j                  5  ||| _        | j                          | j                          | j	                          d d d        y # 1 sw Y   y xY wr@   )_precompile_workerr   _reload_kernel_make_launchers_dynamic_scale_rblock)r   warm_cache_onlyreload_kernels      r;   
precompilezCachingAutotuner.precompile  sf    
 ##%YY 		)
 (&3###%  "&&(		) 		) 		)s   :A##A,c                L   | j                   r_| j                   D ]O  }t        j                  t        |j                  j
                        | j                  j                  dd             Q y | j                  rJ | j                  st        d      g }d }| j                  D ]#  }	 |j                  | j                  |             % t        |      dk(  r$t        dt!        |      j"                   d|       || _         d | _	        y # t        t        f$ r}|}Y d }~}d }~ww xY w)Nr   r   zNo triton configs are availableNo valid triton configs. : )r   r   putr(   kernelhashr   ra   r   rY   r5   ru   _precompile_configr1   r2   rU   r   r6   )r   resultr   excr   es         r;   r   z#CachingAutotuner._precompile_worker  s   .. !!+FMM,>,>?$$((15
 >>!!||&'HII 	A&&t'>'>q'AB	
 1$&+DI,>,>+?r#G   / #J/ s   % DD#DD#c           	     L   | j                   }| j                  j                  dd      rl| j                  j                  d      sO| j                  t        j
                  k(  r0| j                  "|j                  dv r|j                  r|j                  dk\  st        j                  j                  r|j                  |j                  sJ |j                  sJ |j                  sJ d }|j                  xs d}| j                   D ]n  }|j"                  }|j$                  }t'        | j                        dk\  sJ |j(                  j                  dd	      }|j(                  D cg c]  }|j+                  d
      s| }	}|	D cg c]  }|j(                  |    }
}| j                  d   |z   d	z
  |z  }t-        |dd       }|t/        |
 dk  r||j                  |j                  z  k  r||z  }||j0                  z  }t3        |j                  |z  d	      }|||j                  z  k  r+t5        j6                  |      }t3        |	|j(                  j8                        }|j(                  |xx   dz  cc<   |6t;        | j                   D cg c]  }t=        |j"                         c}      }t=        |      }||v r|j?                  |       t@        jC                  d|||       | jD                  jD                  E	 tG        | d      sJ tI        | jJ                        sJ | jK                         jD                  | _"        | j                   jM                  | jO                  |             q | jQ                          y y y y y y y y y c c}w c c}w c c}w )Ndynamic_scale_rblockTpersistent_reduction)cudahip   rL   r   XBLOCKr   Rxn_regs@   keyzNDynamically scale down %s from TritonConfig(%s) and get a new TritonConfig(%s)r   ))rQ   rb   ra   r   r   	REDUCTIONrR   r   majortorchversionr   regs_per_multiprocessormax_threads_per_multi_processormulti_processor_countrN   r   rw   r   rU   rx   
startswithgetattrr   rf   maxcopydeepcopy__getitem__r   r'   addr   r   r   hasattrcallabler   ru   r   r   )r   device_propseen_config_hashesrN   r   rM   compiled_binaryxblockkwargreduction_kwargsrblockstotal_blocknregnreg_per_warpnreg_per_blockmax_blocks_per_sm
new_configlargest_rkwargr   new_config_hashs                       r;   r   z&CachingAutotuner._dynamic_scale_rblock2  s    ''""#94@&&**+AB##}'>'>>+  O3!!""a'5==+<+<33?6666>>>>4444AE#--3I.. XQ &"(--4??+q000&--11(A>'4';';$#u?O?OPS?TE$  $ EUU5=//6UU#s3f<q@VK$?< '0B6 "::"BBCC  $y 0!.1H1H!H %(77>I1%! "3k6W6W"WW!]]=9
 '*$-*>*>*J*J' !!.1a71%-)3 &*%9%9 ! 6ahh?*& #<J"G"&88"&&7		d"!	 77::%
 #4)9:::#D$7$7888"11366DG$$++D,C,CJ,OPqXQt   "C @ ,= " 4 , ? C A($ Vps   ,NNNN!
c                6   t        | j                        t        | j                        k(  ry ddlm} | j                         } ||| j                  d         5  |j                  |j                                g }d }| j                  D ]"  }	 |j                  |j                                $ 	 d d d        t              dk(  r$t        dt              j                   d|       || _        y # t        t        f$ r}|}Y d }~~d }~ww xY w# 1 sw Y   axY w)Nr   )DeviceGuardr   r   r   )rU   r   r   torch._dynamo.device_interfacer   get_device_interfacer   synchronizecurrent_deviceru   make_launcherr1   r2   RuntimeErrorr   r6   )r   r   device_interfacer   r   r   r   s          r;   r   z CachingAutotuner._make_launchers  s
   t~~#d&:&:";;>446 )4+;+;H+EF 
	(()9)H)H)JKIC.. $$V%9%9%;<
	 y>Q!:49;M;M:NbQTPUVWW"	 '
3 C
	 
	s6   3DC1,D1D DDDDDc                    d| j                   _         d| j                   _        d| j                   _        t        | j                   j	                  | j                               | j                   _        g | _        y)zDrop stuff from triton.JITFunction that does not pickle.
        This must be called after precompile so that these things are no longer needed.
        N)r   __globals__used_global_vals
_ConstReprreprr   r   s    r;   prepare_for_picklez#CachingAutotuner.prepare_for_pickle  sO     
"#' !$'',,tww"78r:   c                L    | j                   rJ d       i | j                  dd iS )Nz7pickle should not be called with after make_launchers()r   )r   __dict__r  s    r;   __getstate__zCachingAutotuner.__getstate__  s9    >> 	
E	
!
mm
D
 	
r:   c                l    | j                   j                  |       t        j                         | _        y r@   )r  updater   r   r   )r   states     r;   __setstate__zCachingAutotuner.__setstate__  s"    U#NN$	r:   c                f    ddl m}  || j                  j                  j	                  dd            S )Nr   )get_interface_for_devicer   r   )r   r  rQ   r   replace)r   r  s     r;   r  z%CachingAutotuner.get_device_interface  s)    K'(9(9(>(>(F(Fuf(UVVr:   c                   t        j                  | j                        }|j                  }| j                  j
                  dk(  r$i |}dD ]  }||v s|j                  |      ||<    |d   j                  |       | j                  j                  D ]@  }| j                  j                  |   }||d   vs$|dk(  s|dk(  s/t        ||      |d   |<   B |j                  |d<   |j                  |d<   | j                  j                  dd      xr | j                  j                  dd	       |d
<   | j                  j
                  |d<   | j                  j                   |d<   | j                  j
                  dk(  rt#        j$                          nt#        j&                          t(        st+        d      t)        | j                  |d   |d   |d   d         f}t-        |d   |d   t/        |d               }|d   |d   |d
   d	d}	| j                  j
                  dk(  rd|v r|d   |	d<   d|v r|d   |	d<   ||	d}
	 t1        j2                  |i |
}t=        j>                  tA        |jB                        | j                  j                  dd             tE        |||| j                        S # t4        $ rH t6        j9                  d| j                  j                  dd      | j                  j:                  |        w xY w)z/Ahead of time compile a given autotuner config.r   )matrix_instr_nonkdimwaves_per_eukpack	constantsrf   rg   assert_indirect_indexingTis_hipFr   r   cccpuz0Installed triton version too old, please upgrade	signaturerY   r   )rf   rg   r   sanitize_overflowr  r  )targetoptionsz-Triton compilation failed: %s
%s
metadata: %sr   triton_r   )#r   r   r   rx   rQ   r   popr  r   
constexprs	arg_namesr   rf   rg   rb   ra   r  r   set_driver_to_cpuset_driver_to_gpur*   r  r/   r,   r3   compile	Exceptionr   	exceptionsrcr   r   r(   r   TritonCompileResult)r   r   compile_meta
cfg_kwargsrl   iarg_namecompile_argsr#  r$  compile_kwargsbinarys               r;   r   z#CachingAutotuner._precompile_config  s#   }}T%5%56ZZ
!!U*'JJF 8
?&0nnQ&7LO8 	[!((4## 	MAww((+H|K88K'8|+C6=c86L[)(3	M %(MM[!%(^^\" $ 2 2 6 6&!
 !:$$((599 	W
 '+&7&7&<&<]#!..11T!!U*,,.,,.QRR [)[)Y'*	
 'd+,
 &k2&|4!'*!&	
 !!U*-*6~*F'%52>?U2V./

		^^\D^DF 	#FKK0$2B2B2F2FxQR2S	
 #63d>P>PQQ  	MMA""&&}i@	 	s   J2 2ALc                F   t               rg }|j                  j                  j                         D ];  \  }}|j	                  | j
                  j                  j                  |      |f       = |j                          g |}|D ]  \  }}|j                  ||        |S |S )z
        `args` is passed in with only the non-constexpr args (because the constexpr arg values
        depend on the config). However, in later triton versions, the constexpr args need to be
        added into the args list.
        )
r   rw   rx   rA   ru   r   r(  r   sortinsert)r   r   r   constexpr_argsr3  arg_valnew_argsarg_idxs           r;   _get_args_with_constexprsz*CachingAutotuner._get_args_with_constexprs'  s     *+ 57N%-__%;%;%A%A%C T!'%%tww'8'8'>'>x'H'&RST !wH$2 2 12 Or:   )with_profilerc               *   	  j                   s_j                   j                  j                  dd      kD  r6t        j                  dj                  j                         t        d      S  j                         }|j                  |j                               	  j                  i  	fd}|rddlm}  ||dd	
      S  j                  j                  dk(  rt!        j"                  |      S t!        j$                  |d	      S )z+Measure the performance of a given launcherspill_threshold   z/Skip config %s because of register spilling: %dinfc                      j                   gi \  } } j                  i  j                  |       } |i |di j                         y )Nstream)maybe_clone_argsreset_to_zero_argsr>  restore_args_from_cpu)	cloned_argscloned_kwargsargs_with_constexprsr   
cpu_copiesrx   r   r   rE  s	      r;   kernel_callz+CachingAutotuner.bench.<locals>.kernel_callR  s    )>)>)>*!*%+*&K $D##T4V4#'#A#A+x#X % 
 &&z2r:   r   )do_bench_using_profiling
   (   )warmuprepr   )rR  )r   n_spillsrb   ra   r   r   rw   floatr  get_raw_streamr  copy_args_to_cpu_if_neededtorch._inductor.utilsrN  rQ   r   r   benchmark_cpubenchmark_gpu)
r   r   r?  r   rx   r  rM  rN  rL  rE  s
   `` ``   @@r;   benchzCachingAutotuner.bench<  s     !!h&7&7$:L:L:P:Pr;
 '
 IIA!!
 <446!001A1P1P1RS4T44dEfE
	3 	3 F+KKK!!U*,,[99(("==r:   c                X     j                   si S i t        j                  j                         t        j                  j	                         z
   fd}t         j                  j                  |      D ]  \  }} |||        |j                         D ]  \  }} |||        S )aw  
        To support benchmarking in the presence of mutated args, we need to avoid
        autotuning contanminating them. We try to pass cloned args to the kernel.
        If those clones would increase the peak memory usage, however, we instead
        copy to cpu and restore them after each iteration. Figure out the args
        to be copied and do the copying.
        c                   | j                   v r|j                  rt        |t        j                        sJ t        |j                         |j                         d      }||j                         z  }|kD  rQt        j                  |fd|j                  dd      }|j                  |j                  |fd      d       ||f| <   y |z  y y y )Nr   r   r   T)dtyper   
pin_memorynon_blocking)r   is_cudarr   r   Tensorr   sizestrideelement_sizeempty_stridedr^  copy_
as_strided)r   r   required_storage_lengthrd  cpu_argbudgetcopiesr   s        r;   
maybe_copyz?CachingAutotuner.copy_args_to_cpu_if_needed.<locals>.maybe_copyx  s    t---#++!#u||444*IHHJJJL+'
 /1A1A1CC&=#1102!ii$#'G MM(?'A4H%) "  %(>F4LdNF/ 3>-r:   )	r   r   r   max_memory_allocatedmemory_allocatedzipr   r(  rA   )r   r   rx   rn  r   r   rl  rm  s   `     @@r;   rV  z+CachingAutotuner.copy_args_to_cpu_if_neededj  s       I002UZZ5P5P5RR	#4 TWW..5 	"ID#tS!	"   	"ID#tS!	" r:   c                    |j                         D ]T  }|\  }}t        |j                         |j                         d      }|j	                  |fd      j                  |d       V y )Nr   r]  Tr`  )valuesr   rd  re  ri  rh  )r   rL  pairr   rk  rj  s         r;   rH  z&CachingAutotuner.restore_args_from_cpu  sm    %%' 		DLC&E


'#
 NN35t<BBd C 		r:   c                   | j                   sy t        |      D ]\  \  }}| j                  j                  |   | j                   v s,t	        |t
        j                        sJ d       |j                          ^ |j                         D ]E  \  }}|| j                   v st	        |t
        j                        sJ d       |j                          G y )NzEself.reset_to_zero_arg_names should only contain valid argument names)	r   	enumerater   r(  rr   r   rc  zero_rA   )r   r   rx   r2  r   r   s         r;   rG  z#CachingAutotuner.reset_to_zero_args  s    ++o 	FAsww  #t'C'CC!LL  \	  			   	ID#t333!LL  \	  			r:   c           	     4   	 ddl m	 	 fd}t        j                   j                  j
                  dt        |       |      D cg c]  \  }} |||       }}}|j                         D ci c]  \  }}| |||       }}}||fS c c}}w c c}}w )a  
        Prepare new args and kwargs by cloning any in-place buffers
        (that are not in the provided exclusion list), to avoid autotune
        contaminating them. Avoid cloning the other buffers because it
        leads to increased memory usage.
        r   )clone_preserve_stridesc                t    | j                   v r(| vr$t        |t        j                        sJ  |      S |S r@   )r   rr   r   rc  )r   r   ry  excluder   s     r;   prepare_argz6CachingAutotuner.maybe_clone_args.<locals>.prepare_arg  s<    t---$g2E!#u||444-c22
r:   N)
compile_fxry  	itertoolszip_longestr   r(  rU   rA   )
r   r{  r   rx   r|  r   r   rI  rJ  ry  s
   ``       @r;   rF  z!CachingAutotuner.maybe_clone_args  s     	8	 '224773D3D[s4y3QSWX
c c"
 
 HN||~V)${455VVM))
 Ws   
B3Bc                >     | j                   t               g|i |S r@   )rF  r   )r   r   rx   s      r;   
clone_argszCachingAutotuner.clone_args  s!    $t$$Z\CDCFCCr:   c           
        t        ddd| j                  j                  d      i      5  | j                  D ci c]  }| | j                  |g|i | }}|j                         D ]+  \  }}| j                  j                  |j                  |       - t        j                  t        j                        rt        j                  d| j                  j                         |j                         D ]G  \  }}t        j                  d|j                  ||j                   |j"                  |j$                         I  | j&                  |i | |cd d d        S c c}w # 1 sw Y   y xY w)Nz&CachingAutotuner.benchmark_all_configsTr   )log_pt2_compile_eventmetadataz(Benchmark all input configs for %s, get:z*%s: %f, nreg %d, nspill %d, #shared-mem %s)r!   rb   ra   r   rZ  rA   r   cache_benchmark_resultrw   r   r   r   r   r   r   r6   r   rS  sharedrG  )r   r   rx   r   timingsrl   rm   s          r;   benchmark_all_configsz&CachingAutotuner.benchmark_all_configs  sJ   4"&#T%7%7%;%;M%JK
 	 !% *$**X????G 
   H1##::188QGH .		DdggFVFVW#MMO DAqIID

 $D##T4V49	 		 	s   E EC:E E  E)c           	     ,   t        j                         } | j                  |i |}t        j                         |z
  }t        j                  ||j
                        g| _        | j                  |z   | _        | j                  d   }t        j                  d| j                  j                  |j                  ||   |j                  |j                  |j                          | j"                  r'| j#                  |j                  | j                         yy)zDo the actual autotuningr   r   z>Best config for %s: %s: %f, nreg %d, nspill %d, #shared-mem %sN)timetime_nsr  builtinsminra   r   r   r   r   r   r   r6   rw   r   rS  r  r   )r   r   rx   
start_timer  benchmark_time_taken_nsr   s          r;   autotune_to_one_configz'CachingAutotuner.autotune_to_one_config  s    \\^
,$,,d=f="&,,.:"=",,wGKK@A)),CC 	#
 >>!$		LGGOOHOOOO	
   $2M2MN  r:   c           
     x   | j                   j                  dd       }|J d       t        |j                  j                  d      r |j                  j                  j
                  n|j                  j                  d   t        |j                  d      r|j                  j                  n|j                  j                  j                  t        |j                  d      r|j                  j                  n|j                  j                  j                  |t        |j                        | j                   | j                  |j                  |j                  d	}ddlm} d	d
dj                  | j                  j                   d      }|j                  j"                  |   }|j%                  ||||       d| _        y )Nr   zkernel_name can not be Noner   rf   r  )	mangled_namerf   
shared_memrE  rw   rb   r   def_argsr   r   )CudaKernelParamCachehsacospv)r   xpucubinT)rb   ra   r   binr  r   rf   r  config_to_dictrw   r   r  r   torch._inductor.codecacher  rQ   r   asmsetr   )r   rE  r   r   paramsr  bin_typer6  s           r;   save_gpu_kernelz CachingAutotuner.save_gpu_kernel  s^     $$]D9= == 8<<00&9 %%**\\**62 8<<5 &&\\**44 8<<2 ##\\**11$X__5!//++ ))!++-
0 	C"5155d6G6G6L6LgV!!(+  ffh?!%r:   c                
     j                   t        j                  k(  s j                   t        j                  k(  r|S |j                  |i j
                  j
                  E	 t         d      sJ t         j                        sJ  j                         j
                   _         fd} j                   t        j                  k(  rd|j                  j                  v rJ d       t        j                         } j                  j                  ||j                  d      }t        j                         |z
  }d|_         j                   r! j!                  | j"                  |z   d       j%                  |      S )a&  
        Coordinate descent tuning can be run with or without max-autotune.

        The only difference between these two is the starting config for coordinate_descent tuning.
        E.g., assuming regular autotune only get one config C1; while max-autotune get 4 configs C1, C2, C3, C4
        and max-autotune figure out C3 is the best.

        Then if coordinate desecnt tuning is run with max-autotune disabled, it will start from C1;
        while if coordinate descent tuning is run with max-autotune enabled, it will start from C3.
        Nr   c                B   j                   5  j                  |       j                         }d d d        | <    j                  |gi }t        j                  d|j                  ||j                  |j                  |j                         |S # 1 sw Y   hxY w)Nz4COORDESC: %s: %f, nreg %d, nspill %d, #shared-mem %d)
r   r   r  rZ  r   r   rw   r   rS  r  )rw   r   outr   config2launcherrx   r   s      r;   benchmark_one_configzHCachingAutotuner.coordinate_descent_tuning.<locals>.benchmark_one_configS  s     K226:HHJK&.OF#$**X777CIIF!! JK Ks    BBR0_BLOCKzrCoordinate descent tuner relies on the assumption that persistent reduction's triton config does not have R0_BLOCKT)found_by_coordesc)r   r   TEMPLATEUSER_AUTOTUNErw   r   r   r   r   PERSISTENT_REDUCTIONrx   r  r  r   autotuner  r   r   ra   )	r   r   r   rx   r  r  best_configcoordesc_time_taken_nsr  s	   ` ``    @r;   coordinate_descent_tuningz*CachingAutotuner.coordinate_descent_tuning4  sc    =#9#99""m&A&AA O#??H5 77::
 4!1222D//000))+..DG	" =#E#EEhoo444	
 A		
 
 \\^
))22 (//4
 "&*!<(,%  ++.DD"& ! 
 "";//r:   )benchmark_runc                  | j                   rP| j                  || j                  d         \  }} | j                  |   |i || j                  d   j                  S t        | j                        dk7  rt        | j                        dk(  r@t        j                         }| j                          t        j                         |z
  | _
        t        | j                        dkD  r | j                  |i | t        | j                  d   j                  dd      sE| j                  j                  dd      r) | j                   | j                  d   g|i |g| _        | j                  \  }|j"                  r |r| j$                  s| j'                  ||       | j)                  ||      }| j*                  rB| j                  ||j                        \  }}t-        |||| j                  j.                  |       t0        j2                  rdj5                  d |j                  j                  j7                         D              }	| j8                  xs d| j:                  d	||j                  j<                  |j                  j>                  |	d
}
t@        jB                  jD                  jG                  | j                  j                  dd      ||
      5   ||i |d|icd d d        S  ||i |d|iS # 1 sw Y   y xY w)Nr   r   r  Fr  ,c              3  0   K   | ]  \  }}| d |   ywri   r9   rk   s      r;   rS   z'CachingAutotuner.run.<locals>.<genexpr>  s#      )%11#Qqc
)rn   r   r3   )kernel_filer   kernel_backendrE  rf   rg   kernel_kwargsr   ztriton kernelrE  )$r   _interpret_args_gridrY   r   rx   rU   r   r  r  r   r   r  r   rw   rb   ra   r  store_cubinr   r  r>  r   r   r6   r+   _is_profiler_enabledry   rA   r   r   rf   rg   r   _C	_profiler_RecordFunctionFast)r   rE  r  r   rx   r   r  r   r<  kernel_kwargs_strprofiler_kwargss              r;   runzCachingAutotuner.runx  s      224aIJD$ 4774= ,,q/((  t~~!#4>>"a'!\\^
!040K-4>>"Q&+++T<V<NN1$$&95
  $$%@%H...t~~a/@R4R6RDN nnd>T>T  2--dH=""!66tX__MNHd&(DGG<L<LdS 11 # ))1)?)?)E)E)G) !
 !% 3#//"* %__66&oo88!2O ##77""&&}oF 	
   "	 	   	 	s   K55K>c                `   t         j                  | j                  |      j                  t	        t        g | j                  j                  | j                  j                  dd      |                  }| j                  j                  d      r|d t        | j                  d           }||fS )Nextra_launcher_argsr9   )
GridExpr	from_metarb   	eval_slowdictrq  r   r(  ra   rU   )r   r   r   r   s       r;   r  z%CachingAutotuner._interpret_args_grid  s     !!$"4"4c:DD**++//0ErJ 

 !!"78I3t112GHIIJDTzr:   )NNFNN)r   z	list[str]r   zOptional[str]r   zOptional[list[str]]FN)r   z(Optional[Callable[[], CachingAutotuner]]returndict[str, Any]r  r  r  None)r   r.   r  r/  )r{  zContainer[str]r   tuple[list[Any], dict[str, Any]])r  r  )r   ztuple[Any, ...]r   r.   r  z,tuple[tuple[Any, ...], tuple[int, int, int]])r6   r7   r8   __doc__r   r   r   r   r   r  r  r  r  r   r>  rZ  rV  rH  rG  rF  r  r  r  r  r  r  r  __classcell__r   s   @r;   r   r      s     "&7;TO %TO  TO "5TOp BF) @)&4n#`#0
%WNR`* 49 ,>\.`
.*%*	)*2D>O2!&FB0P 	HT#*0	5r:   r   c                      e Zd ZddZdddZy)r
  c                    || _         y r@   value)r   r  s     r;   r   z_ConstRepr.__init__  s	    
r:   Nc                    | j                   S r@   r  )r   _s     r;   __call__z_ConstRepr.__call__  s    zzr:   )r  rv   r@   )r  rv   )r6   r7   r8   r   r  r9   r:   r;   r
  r
    s    r:   r
  c                       e Zd ZdZe ej                  d      d
d              Z	 	 	 	 	 	 	 	 	 	 d fdZed        Z	ed        Z
ddZddZdd	Z xZS )r/  z
    Upstream Triton CompileKernel can not be pickled.  This is a wrapper
    to support serialization and generate the launcher function.
    rL   c                ,    t        dt        |             S )NKernelMetadata)r   sorted)fieldss    r;   _kernel_metadata_clsz(TritonCompileResult._kernel_metadata_cls  s     *F6N;;r:   c                Z    t         |           || _        || _        || _        || _        y r@   )r   r   r   rw   r0  rb   )r   r   rw   r0  rb   r   s        r;   r   zTritonCompileResult.__init__  s.     	(*r:   c                >    dd} ||       r| j                         S | S )a]  
        Triton uses a nested class called KernelMetadata to store metadata information.
        Pickle does not work well with nested namedtuples, as the namedtuple doesn't appear
        in the toplevel namespace of the module. So these serialization/deser functions
        are used to convert the namedtuples to a dict and back.

        As for packed_metadata, depending on the triton backend, KernelMetadata can be
        a namedtuple, or a regular tuple! So the serialization function branches on whether
        the metadata to be serialized is a namedtuple or regular, serializable one.
        c                Z    t        | t              xr t        | d      xr t        | d      S )N_asdict_fields)rr   tupler   )objs    r;   is_namedtuplez>TritonCompileResult._serialize_metadata.<locals>.is_namedtuple  s/    3& ,C+,C+r:   )r  rt   )r  )r  r  s     r;   _serialize_metadataz'TritonCompileResult._serialize_metadata  s%    	 "##%%Or:   c                    t        | t              r1t        j                  t	        | j                                     di | S | S )Nr9   )rr   r  r/  r  r  keys)r  s    r;   _deserialize_metadataz)TritonCompileResult._deserialize_metadata  s>    h%&;;E(--/<RS   Or:   c           	         | j                   }i |j                  | j                  |j                        | j                  t	        |dd             d d d d}i | j                  d|iS )Npacked_metadata)r  r  modulefunctionr  r   )r   r  r  r  r   )r   r   kernel_states      r;   r  z TritonCompileResult.__getstate__  su    

oo

 00A#77 148  

 9$--8<88r:   c                    t        j                  t               }|d   d   }|d   d   }|j                  j                  i |d   | j	                  |      | j	                  |      d       | j                  j                  |       || _        y )Nr   r  r  )r  r  )r-   __new__r  r  r  r   )r   r  r   r  r  s        r;   r  z TritonCompileResult.__setstate__'  s      ''7?:./*;</ !66x@#'#=#=o#N		
 	U#r:   c                $
   | j                   }| j                  }| j                  }|j                  j                  |j                          	 t        fdt        j                        D              t        fd|d   j                         D              }|j                  t        |d   j                                     }t               rj                  }j                  }d|d   v sd|d   v r|D cg c]	  }|dvs| }}dD ci c]   }|t        |d   j                  |            " }	}|D cg c]  }|	j                  ||       }}net        j                        D 
cg c]  \  }
}|
j                  vr||vr| }}
}t!        |      }j                  D cg c]  }||vr||vr| }}t#        |d      r|j$                  n|j&                  j$                  }|j(                  ||j*                  j,                  |j*                  j.                  t#        |d	      r|j0                  n|j&                  |t#        |d      r|j2                  n|j&                  j2                  t#        |d
      r|j4                  gt7        |dd      n;t#        |d      r.|j&                  j4                  g|j&                  j8                  ndt7        |dd      t7        |dd      d
}t#        |d      sdddddddddddg|}n;|j*                  j,                  rddj;                  |       d}nd}dddddd|ddg	|}d | j<                  v rg || j<                  d    }t>        jA                  | j<                  |      }d!dj;                  |       d"g|jB                  D cg c]  }d#| 	 c}d$|jD                   d%|jF                   d&|jH                   d'dj;                  |       d}tK        d(j;                  |      |       |d)   }||_         tM        |d*d+      |_'        tM        |d,d+      |_(        ||_        | j<                  j                  d-d.      |_)        |jR                  rp|_        ||_*        t               rJt!        |      }|D cg c]	  }||vs| }}|D cg c]   }|d   j                  |d/      d/k7  r||vr|" }}||_+        ||_,        |S c c}w c c}w c c}w c c}}
w c c}w c c}w c c}w c c}w )0z
        Launching triton kernels is performance sensitive, we compile
        a custom Python function get the grid() and reorder the args to
        the underlying wrapper.
        c              3  F   K   | ]  \  }}|j                   v s|  y wr@   )r'  )rO   r2  r   r   s      r;   rS   z4TritonCompileResult.make_launcher.<locals>.<genexpr>R  s%      %
Asbmm9KC%
s   !!c              3  4   K   | ]  \  }}||vr|  y wr@   r9   )rO   rl   rm   known_constantss      r;   rS   z4TritonCompileResult.make_launcher.<locals>.<genexpr>U  s)      
1yQo5 
   r  r!  rf   rg   rf   rg   r  r  num_ctascluster_dimsclusterDimsr  r9   r  cu_functionr  	c_wrapper)
	grid_metar  launch_enter_hooklaunch_exit_hookr  r  rf   cta_argsr  runnerlaunch_metadatagrid_0grid_1grid_2z	*cta_argsrE  r  r  z6bin.launch_metadata((grid_0, grid_1, grid_2), stream, ro   )r  r  zdef launcher(z
, stream):z    z    grid_0 = z    grid_1 = z    grid_2 = z    runner(rq   r   r   NrS  r  F	constexpr)-rw   r0  r   r.  r   _init_handlesr   rv  r(  rA   
differencer  r   rv   ra   r'  r  r   r  r  rx   r   r  r  r  rf   r  r"   r  ry   rb   r  r  rD   x_gridy_gridz_gridexecr   r   rS  r  r  r  r   )r   r   r0  r6  	none_argsr   r  r   rl   replr2  cfg_dictr   binary_sharedscoperunner_argsr   r   linelinesr   r   r   r  s                         @@r;   r  z!TritonCompileResult.make_launcher9  s    kk((ZZ]]	 % %
'5%
 
  
$[1779
 
	
 ((L4M4R4R4T)UV	)+I||H|K88<#<<
 $,s:U/UC 
 9 s<488;<<  <EECTXXc3/E	E (5AsBMM)c.B I 
 &c*H LLx'D	,A H  %VX6FMMFOO<R<R 	
 !'!1!1!C!C & 0 0 A A 6#45 &&__# 6;/   __.. 6:. OO#FNMJ vz2 __--M0L0LM 'vz=I$VUK@;
@ v01 #" K( 11$Z[_[d[den[oZppq"r"(#" K !D$6$66NND$6$67L$MNH!!$"4"4c: DIIh/0
;
(,4TFm4
 DKK=)
 DKK=)	

 DKK=)
 $))K013
 	TYYuu%$!&(D9#FJ='#1155mUKHK!HL-/)#.'/E!1H3DAEE '#K044QDS* 	  !)H!*HM Fr 5* Fs<   9	S)S)%S.8S3,S8&S>&T	T(T2%T)r  ztuple[str, ...]r  r   )
r   r-   rw   r.   r0  r  rb   r  r  r  r  r  )r  LauncherType)r6   r7   r8   r  staticmethod	functools	lru_cacher  r   r  r  r  r  r  r  r  s   @r;   r/  r/    s    
 Y<  <++ + %	+
 &+ 
+  0  9 $rr:   r/  c                    dd l }dd l} |j                         }||j                   |j                  }|g }|j                  |       D ]A  }t        |t              s|j                         D ]  \  }}|| u s|j                  |        C |S Nr   )
gcinspectcurrentframef_localsf_backget_referrersrr   r  rA   ru   )r  r  r  frame	obj_namesreferrerrl   rm   s           r;   _find_namesr#    s     G  "E

 
 I$$S) (h% ( (18$$Q'((
 r:   z	list[Any]collected_callsc                 ,    t         j                          y r@   )r$  clearr9   r:   r;   start_graphr'    s    r:   c           
        t        t              dk(  ry t        d t        D              }t        d t        D              }t        j                         d   j
                  }d| d|dd|dd	||d
z  z  dd	}t        j                  d|       | t        t        d d      }	 t        | d      5 }t        j                  d|        |j                  d       |j                  d| d       |D ]?  \  }}}	}
||z  dz  dd}d| d|
 }t        |||	|d      }|j                  |dz          A |j                  | d       d d d        y y # 1 sw Y   y xY w# t        $ r!}t        j                  d| |       Y d }~y d }~ww xY w)Nr   c              3  &   K   | ]	  }|d      yw)r   Nr9   rO   calls     r;   rS   zend_graph.<locals>.<genexpr>	  s     ;4tAw;   c              3  &   K   | ]	  }|d      yw)r   Nr9   r*  s     r;   rS   zend_graph.<locals>.<genexpr>
  s     9T!W9r,  r   z	SUMMARY (z)
z.2fzms   	 z GB	      @@zGB/s%sc                    t        | d         S r  )rT  )r   s    r;   <lambda>zend_graph.<locals>.<lambda>  s    U1Q4[ r:   T)r   reverserp   z$Save profile bandwidth results to %sz====================
zTRITON KERNELS BANDWIDTH INFO (d   % 	 F)suffixcolorrq   z

z4failed to write profile bandwidth result into %s: %s)rU   r$  sumr  stackr   r   infor  r   r   r    r,  warning)output_fileoverall_time
overall_gbcur_filesummary_strsorted_callsfilemsnum_gbgb_per_sr   
percentager6  bw_info_strr   s                  r;   	end_graphrH    s   
?q ;?;;L999J}}q!**H
H:S
HZ$4F:X[I[;\]`:aae	g  HH  o3HRVW	k3' 14: 

34

<XJcJK9E 35B+$&$5$;C#@!BJ#J<tK=AF"; %##K JJ{T123 

k]$/0'1 1 
1 1(  	KKF 	s7   %E 1BEE EE E 	F%FFc                  2     e Zd Zdddd fd
Z fdZ xZS )DebugAutotunerr   FT)regex_filterr?  with_bandwidth_infoc               \    || _         || _        || _        t        |   |i | d | _        y r@   )rK  r?  rL  r   r   cached)r   rK  r?  rL  r   rx   r   s         r;   r   zDebugAutotuner.__init__6  s6     )*#6 $)&)r:   c          
     l   | j                   st        |   |d|i|ddi y t        |       }t	        |t
               }t        j                  | j                  |      sy t        | j                        dk7  rt        | j                        dk(  r@t        j                         }| j                          t        j                         |z
  | _        t        | j                        dkD  r | j                  |i | | j                  \  }|j                  r| j!                  ||       | j"                   | j$                  |g|d| j&                  i}t        | j(                  j*                  D 	cg c]  }	|	j-                  d      r|	 c}	      }
| j.                  j1                  d	d       }|t3        |d
|
idz  }||dz  z  }||||f| _        t4        j7                  ||||f       t8        j;                  dt=        |||d|              y t4        j7                  | j"                         y c c}	w )NrE  r  Tr   r   r   r?  
in_out_ptrkernel_num_gbnum_in_out_argsg    eAr.  r/  r5  )r6  )rL  r   r  r#  r   rU   rematchrK  r   r  r  r   r   r  r  r  rN  rZ  r?  r   r(  r   rb   ra   r$   r$  ru   r   r:  r    )r   rE  r   rx   possible_namesr   r  r   rC  r3  num_in_out_ptrsrD  rE  r   s                r;   r  zDebugAutotuner.runD  s   ''GKKfKKdK(.N S9:K88D--{;4>>"a't~~&!+!%JOO%48LLNZ4OD1t~~&*/D//@@..KX##$$VX6{{"TZZR4Rt?Q?QR"% )-(9(9$#..|< !# ++//F>*DR/RUXXF!R#X. &(K?&&FHk'JK-FHtK=5I  &&t{{3)s   1H1)r6   r7   r8   r   r  r  r  s   @r;   rJ  rJ  5  s      -4 -4r:   rJ  c           	        t        j                         }| D ]_  }|j                  t        |j                  j                                d|j                   d|j                   dj                                a |j                         S )z:
    Hash used to check for changes in configurations
     rq   )
hashlibsha256r  r  rx   rA   rf   rg   encode	hexdigest)rY   hasherr   s      r;   hash_configsr^  t  sw     ^^F 
cjj&&()*!CMM?!CNN;K2NUUW	

 r:   c                    t              t              dk(  ssJ i nj                  dd      }d|s~t              dkD  sj                  d      r_t        j                  j                  dd      dk(  s<t              }t        j                  |      r/j                        x}	r|	gn|rt        j                  d	       j                  d
d      j                  dd      dv rj                  d      z  g dv r j                  j                  d              fd}
|
S )z
    A copy of triton.autotune that calls our subclass.  Our subclass
    has additional debugging, error handling, and on-disk caching.
    r   Nforce_disable_cachesFr  r   r   r   z;autotune caching is disabled by config.force_disable_cachesr   r9   r   Trestore_valuereset_to_zeroc                   dd l }d |j                  | j                        j                  vrED ]@  }d|j                  v s|j                  d   dk(  sJ |j                  j                  d       B j                  d      r.t        | d   d   xr j                  	
d      S t        | xr j                  	
	      S )
Nr   r   r   profile_bandwidthprofile_bandwidth_regex/profile_bandwidth_with_do_bench_using_profilingT)r   rb   rK  r?  rY   r   r   r   r   r   rR   r   r   rL  )r   rb   rY   r   r   r   r   r   rR   r   r   )
r  r!  r   
parametersrx   r&  ra   rJ  saver   )r   r  tconfigautotune_cacherY   r   r   r   rb   r   r   r   rR   r   s      r;   	decoratorz"cached_autotune.<locals>.decorator  s   
 	,7,,RUU3>>>" 1w~~-">>(3q888NN&&x01
 01!'+*+DE+E   . F>3F3F"3(?)-%+!$(# &  #'*B~/B/B/$;%)!'
 	
r:   )unique_configsrU   ra   rz   r   r^  r   create	read_bestr   r   r&  rV   )rR   rY   r   r   r   rb   r   disabledconfigs_hashr  rk  rj  r   r   r   s   ```````    @@@@r;   cached_autotunerq    sQ    W%Gw<1(('/B]M  !7?H N \A!2!23N!O

1373>#G,&--mX|T,66}gNN{N&- IIST%))*=rB $$^T:L+%[___==)++%&&{'GH.
 .
` r:   c                    t               }g }| D ]4  }t        |      }||vs|j                  |       |j                  |       6 |S )zRemove duplicate configurations)r   r'   r   ru   )rY   seenpruned_configsr   r   s        r;   rl  rl    sO    !+DN '',d?HHSM!!#&	'
 r:   xnumelynumelznumelc                  t        |||fd      D ]s  \  }}|	| | d   }|dk(  r*|dk(  s%J d|j                          d| d| d| d|  d	       t        |   }d
| d}||z  dk(  rXJ d| d| d| d| d| d| d|  d	        y )NXYZBLOCKr   z;TritonKernel.indexing assumes numel == 1 => BLOCK == 1 but znumel==z and zBLOCK=z (cfg=z).zconfig.triton.max_block["z"]r   zTritonKernel.indexing assumes zBLOCK divides z but rj   )rq  lowerr   )	r   rv  rw  rx  rE   labelblock	max_blockmax_block_strs	            r;   check_configr    s    VVV4e< 
u=ugUO$A:A: geWE%ugVTWSXXZ\: %U+	3E7"=5 A% 	
,UG>-E7&u]O1YKvcURTV	
%
r:   c           	         | j                         D ]<  \  }}d}||v s|j                  |      }t        |   }||k  r-J d| d| d| d        y)z@
    Check that block sizes are within the maximum allowed.
    r{  'z' too large. Maximum: z
. Actual: .N)rA   removesuffixr   )r   varvalblock_suffixrD   r  s         r;   check_max_blockr    sr     IIK S3%%l3F(0I)# C5.ykC5J#r:   c                    t         j                  j                  r|dz   dz  }|dz   dz  }|r|dz  }t        t	        t        | |      |            S )Nr   r   )r   r   r   r%   r  r   )rf   max_num_warpsmin_num_warpsregister_intensives       r;   
_num_warpsr    sU     }}&*q0&*q0%*3s9m<mLMMr:   c                    d}t         j                  j                  rdnd}| d   |z   dz
  |z  }||z  |z  |kD  r&|| d   k  r|dz  }|dz  }||z  |z  |kD  r	|| d   k  r||z  |z  |kD  rt        d      ||fS )Nr   rL   r   r   r   zQReduction config exceeds cudaDeviceProp maxGridSize. Please raise a pytorch issue)r   r   r   AssertionError)rR   r   rf   
max_grid_xrN   
num_blockss         r;   _check_max_grid_xr    s    JmmR  S/A%)a/J	!I-
;JsO@S	Q1_
 	!I-
;JsO@S 	Y*j8_
 	
 j=r:   c           	        g d}t        |||      }t        | j                          |k  r|dz  }t        || d         }|rt        || d         }|rt        || d         }|t        | d   t        d         k  r\||d   z  | d   k  st        |||      |k  r>|dz  }|t        | d   t        d         k  r ||d   z  | d   k  r-t        |||      |k  r>|rw|t        | d   t        d	         k  r^||d
   z  | d   k  st        |||      |k  r@|dz  }|r9|t        | d   t        d	         k  r ||d
   z  | d   k  r/t        |||      |k  r@|rw|t        | d   t        d         k  r^||d   z  | d   k  st        |||      |k  r@|dz  }|r9|t        | d   t        d         k  r ||d   z  | d   k  r/t        |||      |k  r@t	        t        |||      |z  d
      }	t        |||      dk\  r&t
        j                  j                  st        |	d      }	| d   }
| j                  d      }| j                  d      }t        t        |||      |t        z  |	z        }|t        j                  |t        |||      z        z  }t        | ||	      \  }}t        || d         }d|i}|r||d<   |r||d<   t        |       t        ||
||       t!        ||	|      S )a  
    Construct a pointwise triton config with some adjustment heuristics
    based on size_hints. Size_hints is a tuple of numels in each tile
    dimension and will be rounded up to the nearest power of 2.

    num_elements_per_warp is a suggestion for controlling how many warps
    the triton config should contain. e.g.: if x=16, y=8, z=4 then
    num_elements = 16*8*4 = 512. Then if we set num_elements_per_warp=128,
    we'll launch 512 (elem) / 128 (elem/warp) = 4 warps. Note that it's
    just a suggestion, and sometimes other adjustment heuristics will
    override the num_elements_per_warp.

    min_elem_per_thread controls the minimum number of elements
    processed by each thread. It's always enforced.
    )r    r  r   r   yzXr   r   Yr   Zr     rH   r   YBLOCKZBLOCKru  r  )r   rs  r  r   r  r   r   r   r   ra   r   mathceilr  r  r  r.   )rR   r   r  r  rg   rK   min_elem_per_threadmaxGridSizer#  rf   rv  rw  rx  rX   _num_blocksr   s                   r;   rM   rM   .  sA   4 -K Aq)FJ--/0691 	Az#A:c?#:c?# c*S/#3C#89
9	KNZ_,0CAq!0Lv0U	Q c*S/#3C#89
9	KNZ_,0CAq!0Lv0U 	
JsO%5c%:;;AC0"1a+f4 	
Q 	
JsO%5c%:;;AC0"1a+f4
 	
JsO%5c%:;;AC0"1a+f4 	
Q 	
JsO%5c%:;;AC0"1a+f4
 Aq!$(==QI 1a#s*5==3D3D	1%	_F^^C F^^C F Aq!$33i?J : 3Aq! <<	==A&z1i@NA{Az#AQ-CHHCVF6B#zBBr:   c                  	 t        | t                    } t        D cg c]  }t        |      s| c}      }| }i 	t	        |dz
  dd      D ]X  }d| d}t        |   t
        |j                                  }t        ||      }||z  dk(  sJ d| d| d       |	|<   ||z  }Z t        	j                          }| |k(  sJ d		 d
|  d       t        	fd	D              sJ d	 d        	S c c}w )aC  
    Converts a linear reduction numel to ND, in row major order.
    This order is often desirable as it presents opportunities to coalesce memory
    accesses.
    For example, if r = 64 and size_hints = [32,32], this function returns [32, 2].
    This unraveling works because both r and size_hints are powers of 2.
    r   rr  r   zExpected dimension 'z' to divide remaining size 'r  zExpected ND reduction size (z
) to have z
 elements.c              3  4   K   | ]  }|   |   k    y wr@   r9   )rO   rD   rnumelsrR   s     r;   rS   z+_get_nd_reduction_numels.<locals>.<genexpr>  s     Kwv*V"44Kr  zrnumels exceed size_hints. z > )
r  rF   rU   r   ranger   upperr   rs  all)
r  rR   rD   num_reduction_dims	remainingidxmax_sizedimfinal_numelr  s
    `       @r;   _get_nd_reduction_numelsr    sM    	A(45A(HF,?,GH IG'!+R4 SEz&)+;FLLN+KL(I&3!# 	
"3%'CI;aP	
# c	 &w~~'78K 
&wiz!JG K7KK 
%gYc*>K N1 	Is
   C9C9c                   t        ||       t        | d         dfd} |       }t        | j                          |k  r|dz  }| d   k  r" |       |k  rdz  | d   k  r |       |k  rt	              D ]=  }|   | |   k  s |       |k  s|xx   dz  cc<   |   | |   k  s3 |       |k  r$? |
 |       dz  }t        |d|      }t        | |      \  }	t	              D ].  } |       |kD  s|   dk(  r|xx   dz  cc<    |       |kD  r!0 t        di      }
t        |
       t        |
| d   	       t        |
||
      S )z
    Construct a reduction triton config with some adjustment heuristics
    based on size_hints. Size_hints is a tuple of numels in each tile
    dimension and will be rounded up to the nearest power of 2.
    r   c                 8    t        g j                          S r@   r   rs  )r  r   s   r;   total_numelz,triton_config_reduction.<locals>.total_numel  s    "18w~~'788r:   r   r   r  rB  )r  r  r   )rv  r  r  rs   )r  r  r   rs  r  r  r  _get_configr  r  r.   )rR   r   r  rg   rf   r  r  r#  rD   r  r   r  s    `         @r;   triton_config_reductionr    s    'q*5G 	Az#A9 ]FJ--/0691 jo
+-&"8	Q jo
+-&"8/ !fo
6 22{}v7MFOq O fo
6 22{}v7M! MS(	8JI 'z1i@NA{/ "mf$v!#FO!O mf$" sA))
*CCZ_-#zBBr:   c                x    | j                         D ci c]  \  }}|j                         dz   | c}}S c c}}w )zW
    Convert numels ("x", "r0_", etc.) to block sizes ("XBLOCK", "R0_BLOCK"), etc.
    r{  )rA   r  rB   s      r;   r  r    s4    
 BHPFLLNW$e+PPPs   6c                  
 t        ||       
t        | d         t        | d         d
fd} |       }t        | j                          |k  r|dz  }| d   k  r" |       |k  rdz  | d   k  r |       |k  rt	        
      D ]=  }
|   | |   k  s |       |k  s
|xx   dz  cc<   
|   | |   k  s3 |       |k  r$? | d   k  r" |       |k  rdz  | d   k  r |       |k  rt        d
      }t         |       dz  d	      }	t        || d
   | d          t        |       t        ||	|      S )z
    Construct a tile reduction triton config with some adjustment
    heuristics based on size_hints. Size_hints is a tuple of numels in
    each tile dimension and will be rounded up to the nearest power of 2.
    r   r  c                 :    t        g j                          S r@   r  )r  r   r  s   r;   r  z2triton_config_tiled_reduction.<locals>.total_numel  s    "1a;'..*:;;r:   r   r   r   r   r     r  r   )rv  rw  r  r  )
r  r  r   rs  r  r  r  r  r  r.   )rR   r   r  r  rg   r  r#  rD   r   rf   r  s    ``       @r;   triton_config_tiled_reductionr    sy    'q*5G 	Az#AAz#A< ]FJ--/0691 jo
+-&"8	Q jo
+-&"8/ !fo
6 22{}v7MFOq O fo
6 22{}v7M! jm
 6	Q jm
 6 AA11
2C;=C/qAIZ]:a=AC#zBBr:   c                ~   |i n|}|j                  d      rJ t        j                  t        j                  | j                               }t        dt        |dz  d            }t        |j                  dt                     | ||d         }t        j                  t        |      }	d}
t        |       d	k(  rSt        |      r-|j                  d
      s|j                  d      s |	| |      g}
n |	| |d       |	| |dz  d      g|}
t        |       dk(  rt        |      s|t        j                  k(  r.|j                  d
      s|j                  d      s |	| dd      g}
n: |	| dd       |	| dd       |	| dd       |	| dd       |	| |d	       |	| d	|      g|}
t        |       dk(  rbt        |      r |	| ddd      g}
nJ |	| ddd       |	| ddd       |	| ddd       |	| ddd       |	| |d	d	       |	| d	|d	       |	| d	d	|      g|}
|
st!        d|        t#        | |
||t$        j&                  |      S )z=
    Construct @triton.heuristics() based on size_hints.
    Nno_x_dimr  r     autotune_hintsr   )r  r   max_autotunemax_autotune_pointwise)rK   r   r   rL   rB  rI   r   size_hints: r   rb   r   r   )ra   r  reduceoperatormulrs  r   r  r\   r   partialrM   rU   rc   r   SQUARENotImplementedErrorrq  r   	POINTWISE)rR   r   	tile_hintr   r  rb   rE   bshinted_configstriton_config_with_settingsrY   s              r;   	pointwiser    s    (/B]M  ,,,X\\:+<+<+>?E	S#eslD)	*B.*JL9
H	N #,"3"3+># G
:!'6n-  !9:2:rBCG ,JRUV+ar
  G :!(79;Wn-  !9:2:r2FGG ,JB?+JB?+JR@+JC@+JA>+J2>  G :!'62:r2rJKG ,JBC+JAqA+J2qA+J1bA+JAqA+J2qA+J1bA	  	G !L"=>>#$.. r:   c                   |j                  dd       }t        |       }d}d}| d   dk\  r,|j                  dd      |j                  dd      z   d	k\  rd}d
}t        | dd|cxk  r|k  rn n|n||      }t        | dd|      }t        | |dk  rdd|z  z  ndt        ||      |      }|j                  d      s|j                  d      rnB|t        j
                  k(  r|gS |t        j                  k(  r|gS |t        j                  k(  r|gS t        |      rt        | dd      gS |||t        | dd      t        | dd      t        | ddd      gS )Nreduction_hintFi   r   r  num_loadr   num_reductionrO  Tr   r  r  r   r   r   r  r  rL   r  i   rH   )rf   )	ra   rF   r  r  r   INNEROUTER
OUTER_TINYrc   )	rR   rb   r  rnumelr  MAX_R0_BLOCKcontiguous_configouter_configtiny_configs	            r;   _reduction_configsr  p  s    #&&'7>N 'z2FL34j!,}/@/@RS/TT !/	.,.L-	 +B.@L *%}SF]!FL!-	K (M,=,=>V,W	=..	.!""	=..	.~	=33	3}#M2'
B<==
B3
As3 	 
BQ?
 
r:   c                    |i n|}||d<   |j                  d      rd| d<   |J t        | |      }t        | |||t        j                  |      S )zargs to @triton.heuristics()r  r  r   r   rR   rb   rY   r   rb   r   r   )ra   r  rq  r   r   rR   r  r   r   rb   rY   s         r;   	reductionr    so     (/B]M&4M"#$
3""" JmTG#$.. r:   c                   |i n|}||d<   |j                  d      rd| d<   t        |       dk(  sJ d       | d   | d   }}d}t        dt        ||z  t                    }||k\  sJ |t        k  sJ |d	   rt        |||z  d
||      }	nt        |||z  d
|      }	|	D ]  }
||
j                  d<    t        | |	||t        j                  |      S )Nr  r  r   r   r   z:Cooperative reductions don't support tiling reduction dimsr0_r   r   )r   r  r  RSPLITr  )ra   rU   r   r  r   _persistent_reduction_configsr  rx   rq  r   r   )rR   r  r   r   rb   rv  r  r#  splitrY   rw   s              r;   cooperative_reductionr    s#    (/B]M&4M"#$
3 z?a D  _j&7FF F3v'):;<EU??%%%%+,/51>=
 %#FeO<'
  ("'h( #$.. r:   c           	        | d   }t        |       }dD cg c]#  }|dk(  s||z  dk  r||k  rt        | ||d      % }}|t        j                  k(  r|dk\  r|d d }nG|t        j                  k(  r|dd  }n.|t        j
                  k(  rt        | |dk  rd	d|z  z  nd|      g}|D ]A  }| D ]:  }t        |      s|j                  j                  |j                          d
       < C t        |      r|d d }|S c c}w )Nr   )r   r   rL   r  r   i   Tr  r  r  r   r{  )rF   r  r   r  r  r  r   rx   r&  r  rc   )	rR   r  rb   rv  r  r   rY   r   rD   s	            r;   r  r    s0   
 _F&z2F &Q;6F?d2v7G 	 
FFtTG  ,,,3"1+	=..	."#,	=33	3#'-}SF]#!
  7  	7F"6*/u56	77 $M2"1+N9s   (C<c                    |i n|}||d<   |j                  d      rd| d<   t        | ||      }t        | ||||t        j                        S )Nr  r  r   r   )r   rb   r   r   )ra   r  rq  r   r  r  s         r;   r   r   #  se     (/B]M&4M"#$
3+JVG#$99 r:   c                   |i n|}||d<   |j                  d      rd| d<   |J t        |       dk7  rt        d|        t        | |      }|j                  dd	      }|D ]^  }t	        |j
                  j                               D ]6  }|j                  d
      s|j
                  |   |k  s(||j
                  |<   8 ` t        | |||t        j                  |      S )z#Heuristic for TritonSplitScanKernelr  r  r   r   r   r  r  min_split_scan_rblockr  r   r  )ra   rU   r  r  listrx   r  r   rq  r   
SPLIT_SCAN)	rR   r  r   r   rb   rY   
min_rblockr   r  s	            r;   
split_scanr  ;  s     (/B]M&4M"#$
3"""
:!!L"=>> JmTG ""#:C@J -

)* 	-C~~c"szz#'C",

3	--
 #$// r:   c                n    t        dt        j                  i | |      g||t        j                  |      S )z#
    Compile a triton template
    Nrg   rf   r  rq  r3   r.   r   r  )rg   rf   r   r   rb   s        r;   templater  _  s9     	rjI	FG#$-- r:   c                L    i }dD ]  }| j                  |d      }||||<    |S )z7Extract triton.Config options that should become kwargs)rf   rg   r  maxnregN)r&  )rw   poppedr   r  s       r;   _pop_config_kwargsr  m  s;    FA jjd#?F3K Mr:   c                P    i | j                   | j                  | j                  dS )Nr  )rx   rf   rg   rw   s    r;   r  r  w  s/    
--%%'' r:   c                4    i | } t        | fi t        |       S r@   )r.   r  r  s    r;   config_from_dictr    s!    ZF&7.v677r:   c           
         i | } t        dt        j                  | fi t        |       g||t        j
                  |      S )zH
    Used when the configuration is already decided at compile time
    Nr  )rq  r3   r.   r  r   FIXED)rw   r   r   rb   s       r;   fixed_configr    sI     ZF	v	<!3F!;	<=#$** r:   c           	         t        |       dk(  rt        j                  i       g} ng t        t        |       } t        d| |t        j                  |||      S )z.
    Compile a user defined triton kernel
    r   N)r   r   r   rb   r   )rU   r3   r.   mapr  rq  r   r  )rY   r   r   rb   r   s        r;   user_autotuner    s[     7|q==$%3C('23$22## r:   c                n    t        dt        j                  i d|      g| |t        j                  |      S )z)
    Compile a triton foreach kernel
    Nr   r  r  r  )r   rf   r   rb   s       r;   foreachr	    s9     	ra9	=>#$-- r:   c                      e Zd ZU dZded<   dZded<   dZded	<   d
Zded<   d
Zded<   d
Z	ded<   ddZ
ddZ	 	 	 	 	 	 ddZddZddZ	 	 	 	 	 	 ddZddZe	 d	 	 	 	 	 	 	 d d       Zd!dZy)"r  z3Generate code for grid size expressions in launcherr  rb   pythonLiteral['python', 'cpp']moder9   zSequence[str]rD   r   Union[str, int]r  r	  r
  c                $    | j                   dv sJ y )N)r  cpp)r  r  s    r;   __post_init__zGridExpr.__post_init__  s    yy----r:   c                    t         r@   r  r   metas     r;   generatezGridExpr.generate  s    !!r:   c                    ||dk(  r|S t        |t              rt        |t              rt        ||      S | j                  dk(  r	d| d| dS d| d| d| dS )	Nr   r  z-((z) // -(z))z((z + (z
 - 1)) / ()rr   rs   r   r  )r   rE   r~  s      r;   r   zGridExpr.ceildiv  sp     =EQJLeS!j&<5%((99 wugR00E7$ugZwb99r:   c                    | j                  t        |      }t        |      dk  r|d   S | j                  dk(  r#ddj	                  t        t        |             dS t        j                  d |      S )zPCodegen for max function with constant folding, constants are represented as intr   r   r  zmax(ro   r  c                    d|  d| dS )Nz	std::max(ro   r  r9   r  s     r;   r1  z"GridExpr.maximum.<locals>.<lambda>  s    y2aS-B r:   )	_constant_foldr   rU   r  ry   r  rv   r  r  r   seqrA   s      r;   maximumzGridExpr.maximum  sg    ##C-u:?8O99 $))CUO45Q77 BEJJr:   c                    | j                  t        |      }t        |      dk  r|d   S dj                  t	        t
        |            S )zPCodegen for sum function with constant folding, constants are represented as intr   r   z + )r  r8  rU   ry   r  rv   r  s      r;   	summationzGridExpr.summation  s=    ##C-u:?8Ozz#c5/**r:   c                    |D cg c]  }t        |t              r| }}|D cg c]  }t        |t              s| }}|r|j                   ||             |S c c}w c c}w )z?Constant fold through a commutative fn where ints are constants)rr   rs   ru   )r   r   r  r   rA   const_itemss         r;   r  zGridExpr._constant_fold  s]     47'QajC>P'Q'Q"%<QAs);q<<LLK)	 (R<s   AAA!A!c                    | j                   dk(  r| d| S | j                   dk(  r	d| d| dS t        d| j                          )Nr  z = r  z	uint32_t ;zinvalid mode )r  r  )r   r   exprs      r;   
assign_tmpzGridExpr.assign_tmp  sV    99 V3tf%%99tfCvQ//}TYYK899r:   c                    t               | d      }t        |t              sJ  || |      }t        |t              rt        |      }|j                  |       |S )N	grid_type)rb   r  )globals
issubclassr  rr   r.   r  r  )rb   r   r  grid_clsr   s        r;   r  zGridExpr.from_meta  sV     9];78(H---m$?c6" %Ccr:   c                    i |}| j                   D ]  }t        ||        t        d| j                   |       t        d| j                   |       t        d| j                   |       |d   |d   |d   fS )Nz	grid_0 = z	grid_1 = z	grid_2 = r  r  r  )rD   r  r  r	  r
  )r   r  r  r  s       r;   r  zGridExpr.eval_slow	  s    4KK 	Du	y&.y&.y&.Xhx@@r:   N)r  r  r  dict[str, int]r  r  )rE   r  r~  zUnion[None, int, str]r  r  )r  list[Union[int, str]]r  zUnion[int, str])r   zCallable[[list[int]], int]r  r.  r  r.  )r   rv   r$  r  r  rv   )r  )rb   r  r   zUnion[Config, dict[str, int]]r  r  r  r  )r  r-  r  ztuple[int, int, int])r6   r7   r8   r  __annotations__r  rD   r  r	  r
  r  r  r   r  r  r  r%  r  r  r  r9   r:   r;   r  r    s    =!!%-D
"-FMFOFOFO."
:$
:-B
:	
:K+,3H	:  *2%* ' 
	 Ar:   r  c                      e Zd ZddZy)Grid1Dc                P    | j                  d|j                  d            | _        y )Nrv  r   )r   ra   r  r  s     r;   r  zGrid1D.generate	  s    ll8TXXh-?@r:   Nr,  r6   r7   r8   r  r9   r:   r;   r1  r1  	  s    Ar:   r1  c                      e Zd ZddZy)Grid2Dc                    | j                  d|j                  d            | _        | j                  d|j                  d            | _        y )Nrv  r   rw  r  )r   ra   r  r	  r  s     r;   r  zGrid2D.generate	  s8    ll8TXXh-?@ll8TXXh-?@r:   Nr,  r3  r9   r:   r;   r5  r5  	      Ar:   r5  c                      e Zd ZddZy)Grid3Dc                    | j                  d|j                  d            | _        | j                  d|j                  d            | _        | j                  d|j                  d            | _        y )Nrv  r   rw  r  rx  r  )r   ra   r  r	  r
  r  s     r;   r  zGrid3D.generate	  sS    ll8TXXh-?@ll8TXXh-?@ll8TXXh-?@r:   Nr,  r3  r9   r:   r;   r9  r9  	  s    Ar:   r9  c                      e Zd ZddZy)Grid2DWithYZOverflowc           	     L   | j                  d|j                  d            | _        | j                  d| j                  d|j                  d                  | j                  d| j                  dt	                           g| _        | j                  dd      | _        d| _        y )Nrv  r   y_grid_raw_rw  r  y_grid_div_)r   ra   r  r%  r#   rD   r	  r
  r  s     r;   r  zGrid2DWithYZOverflow.generate 	  s    ll8TXXh-?@OOM4<<$((8BT+UVOOt||M>;KL
 ll=-@#r:   Nr,  r3  r9   r:   r;   r<  r<  	  s    	$r:   r<  c                      e Zd ZddZy)CooperativeReductionGridc                v    t        |d         | _        | j                  d|j                  d            | _        y )Nr  rv  r   )rv   r  r   ra   r	  r  s     r;   r  z!CooperativeReductionGrid.generate-	  s-    $x.)ll8TXXh-?@r:   Nr,  r3  r9   r:   r;   rA  rA  ,	  r7  r:   rA  c                      e Zd ZddZy)SplitScanGridc                    |j                  dd      dk(  sJ | j                  d|j                  d            | _        d| _        y )Nr   r   r0_numelr  rv  )ra   r   r  r	  r  s     r;   r  zSplitScanGrid.generate3	  s=    xx!$)))ll:txx
/CDr:   Nr,  r3  r9   r:   r;   rD  rD  2	  s    r:   rD  c                  &    e Zd Zedd       ZddZy)	FixedGridc                 2    t         j                  g dg ddS )z>Inductor meta so the launcher takes three extra grid arguments)_grid_0_grid_1_grid_2)r'  
fixed_gridr  )rH  r6   r9   r:   r;   setup_grid_as_argszFixedGrid.setup_grid_as_args:	  s     #++;#D
 	
r:   c                H    | j                   d   \  | _        | _        | _        y )NrM  )rb   r  r	  r
  r  s     r;   r  zFixedGrid.generateC	  s    040B0B<0P-T[$+r:   Nr  r,  )r6   r7   r8   r  rN  r  r9   r:   r;   rH  rH  9	  s    
 
Qr:   rH  c                      e Zd ZddZy)PrecomputedGridc                    | j                   d   D ]K  }t        fd|d   j                         D              s)|| j                     \  | _        | _        | _         y  t        d d| j                   d          )Nprecomputed_gridsc              3  L   K   | ]  \  }}j                  |      |k(    y wr@   r`   )rO   rl   rm   r  s      r;   rS   z+PrecomputedGrid.generate.<locals>.<genexpr>J	  s"     L1488A;!#Ls   !$rw   zPrecomputed grid not found for z in )rb   r  rA   r  r  r	  r
  r  )r   r  	candidates    ` r;   r  zPrecomputedGrid.generateH	  s    ++,?@ 	IL	(0C0I0I0KLL8A$))8L5T[$+	 -dV48J8JK^8_7`a
 	
r:   Nr,  r3  r9   r:   r;   rQ  rQ  G	  s    
r:   rQ  c                  ,    e Zd ZddZ	 	 	 	 	 	 	 	 ddZy)ComboKernelGridc                    | j                   d   }|d   r	i |d   |}g }g }g }g }t        |d         D ]  }|d|    |d|    dkD  sJ |j                  |d|           |j                  |d|    xs d|        d| |v r|j                  |d|    xs d|        d| |v sz|j                  |d|    xs d|         | j                  |||      | _        |d	   r%| j                  | j                  |d	   g      | _        |r5| j                  | j                  |      |j                  d
            | _        |r6| j                  | j                  |      |j                  d            | _	        y y )Ncombo_grid_metadefault_confignum_kernelsxnumel_r   	no_x_dim_ynumel_znumel_
min_blocksr  r  )
rb   r  ru   combo_x_gridr  r  r   ra   r	  r
  )r   r  
combo_meta	no_x_dimsxnumelsynumelsznumelsnums           r;   r  zComboKernelGrid.generateS	  s   ''(9:
&';j!12;d;D	M23 		OCWSE?+3zGC5/7RUV7VVZ)C5(9:;NN:uo6IGC5/J*,zGC5/:MuoN*,zGC5/:MuoN		O ''DAl#,,Z5M'NODK,,t||G'<dhhx>PQDK,,t||G'<dhhx>PQDK r:   c                    t         r@   r  )r   rd  rc  r  s       r;   ra  zComboKernelGrid.combo_x_gridn	  s
     "!r:   N)r  r-  rd  r.  rc  
list[bool]r  r-  r  r  )r6   r7   r8   r  ra  r9   r:   r;   rW  rW  R	  s3    R6"&" " 	"
 
"r:   rW  c                  $    e Zd Z	 	 	 	 	 	 	 	 ddZy)SequentialComboKernelGridc                    t        |      t        |      k(  sJ | j                  t        ||      D cg c]*  \  }}| j                  ||rdn|j	                  d            , c}}      S c c}}w )Nr   r   )rU   r  rq  r   ra   )r   rd  rc  r  r   r  s         r;   ra  z&SequentialComboKernelGrid.combo_x_gridx	  sk     7|s9~---~~ $'w	#:Ax QX488H3EF
 	
s   /A+
Nri  r6   r7   r8   ra  r9   r:   r;   rl  rl  w	  s-    
&
 
 	

 

r:   rl  c                  $    e Zd Z	 	 	 	 	 	 	 	 ddZy)RoundRobinComboKernelGridc                   t        |      t        |      k(  sJ | j                  d   d   }t        ||      D cg c]
  \  }}|s	| }}}t        ||      D cg c]
  \  }}|r	| }}}|r?|j                  | j	                  | j                  |      |j                  d                   d| j                  |       d| S c c}}w c c}}w )NrY  r[  r   (z) * )rU   rb   rq  ru   r   r  ra   )	r   rd  rc  r  r[  r   r  exprsxnumels_x_dims	            r;   ra  z&RoundRobinComboKernelGrid.combo_x_grid	  s     7|s9~---(():;MJ&)'9&=J{q(JJ.1'9.EV{q(XVVLLdll=&A488HCUVW4<<&'tK=99	 KVs   
CC
C'CN)rd  r.  rc  rj  r  r-  r  rv   rn  r9   r:   r;   rp  rp  	  s-    :&: : 	:
 
:r:   rp  )rC   r-  r  rs   )rW   zOrderedSet[AutotuneHint]rX   rs   rQ   r   r  list[Config])rY   ru  )NNF)rR   zOptional[list[int]]rY   ru  )r   r-  )r   r   F)NNr   r  r   )r  r.   )r  rs   rR   r-  r  r-  )r   NF)r   rs   r  rs   r  r.   )rC   r-  r  r-  r]  )NNr   N)rR   r-  rb   r  r  ru  )FNNNr  )NN)rw   r  r  r  )rw   r.   r  r  )rw   r  r  r.   )
__future__r   r  r   dataclassesr  rY  r  r~  r   r  r  rz   os.pathrS  r}   r   r  collectionsr   typingr   r   r   r   r	   r
   r   torch._prims_commonr   torch.utils._ordered_setr   triton_bundlerr   utilsr   r   r   r   rj  r   benchmarkingr   coordinate_descent_tunerr   rW   r   r   r   r   r   r   r   r   runtime_utilsr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   triton_compatr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r  r5   collections.abcr<   r=   r>   r  	getLoggerr6   r   rF   r\   rc   r   r   r
  r/  r#  r$  r/  r'  rH  rJ  r^  rq  rl  r  r  r  r  rM   r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r	  	dataclassr  r1  r5  r9  r<  rA  rD  rH  rQ  rW  rl  rp  r9   r:   r;   <module>r     sb   "           	  	 
   " I I  ? / * G  ) % 3	 	 	     	< 	 ==L g!(#( ( #	(
 (V=<4k k\ N Nb"   ,^<4% <4~	" `#``F
 !%T$ 
$	N* 
`C `CF#T 2C
2C 2C 2CjQ!CN UpA!A2@AAL 4+` $R 4 !H8
  LQ* SA SA SAlAX A
AX AAX A
$8 
$Ax AH Q Q
h 
""h ""J
 
 : :r:   