
    Vh#                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl:m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZY erd dlZm[Z[m\Z\ d dl]Z]ddl^m_Z_  eQ       j                  Zaebej                  ej:                  edf   Zeee:j                  eRf   ZgdHdZhdIdZidJd ZjdKd!ZkdLd"Zlemedenf   Zoeebeenej,                  f   d#f   eeogebend#f   f   f   Zp	 dM	 	 	 	 	 	 	 	 	 dNd$ZqdOd%Zrej                   G d& d'             Zt G d( d)      Zu G d* d+      Zvej                   G d, d-ev             Zwej                   G d. d/ev             Zxej                   G d0 d1ev             Zy G d2 d3ev      Zzej                   G d4 d5ev             Z{ej                   G d6 d7e{             Z|ej                   G d8 d9e{             Z}ej                   G d: d;e{             Z~ G d< d=e{      Zej                   G d> d?ev             Zej                   G d@ dAe             Zej                   G dB dCe             ZedZ G dD dEeN      Z G dF dGe      Zy)P    )annotationsN)count)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLineIndentedBufferPythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLoweringc                    t         j                  j                  |       }| j                         | j	                         t        t         j                  j                  j                  |            fS N)r+   graphget_allocation_storage_sizeget_device_or_error	get_dtyper(   sizevarssimplify)nodestorage_sizes     O/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrF   T   sU    7766t<L  " 	!''""++L9:     c                   | j                         |j                         k7  ry| j                         |j                         k7  ryt        j                  j                  j                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |            }t        |      t        |      k(  sWt        j                  j                  j                  |d|z        r+t        j                  j                  j                  ||      ryy)NFgffffff?T)
r?   r@   r+   r=   rA   rB   r>   r(   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rE   can_match_buffer_sizerO   `   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMGG11+zJrG   c                   ddl m}m} t        | j                        }|dk(  r.| j
                  | j
                  j                  rd| dS d| dS ||v r||   }|S |j                         D ]Q  \  }}t        j                  |dz   |      }t        |      dk(  s.|d   }||v sJ d	| d
|        ||   }	| d|	 dc S  t        d|       )Nr,   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprQ   rR   repr	real_type
alias_infois_writeitemsrefindalllenAssertionError)
argrQ   rR   python_typecpp_typepy_containercpp_containercontainer_matchcontained_typecpp_contained_types
             rE   convert_arg_typeri      s   ; s}}%Kh>>%#..*A*A+a((+g..m# - (?'D'D'F <#m**\4F%FT1$,Q/N!]2 |n,GGWX2 "/~!>#_A&8%9;;< 2;-@
AArG   c                    t        | j                        }ddd}|j                  |d       }|
J d|        |dk(  r| j                  |dz  }|S )Nz
at::Tensorzstd::vector<at::Tensor>)rS   zList[Tensor]zNYI return type: rS   rT   )rX   rY   getrZ   )retrb   python_to_cpprc   s       rE   convert_return_typern      sk    s}}%K1M
   d3HB#4[M!BB h3>>#=COrG   c                   | j                   j                  }| j                   j                  }t        |      }|dkD  sJ d       |dk(  rt	        |d         }n3|dkD  r.dj                  |D cg c]  }t	        |       c}      }d| d}|D cg c]  }t        |       d|j                    }} ddj                  |       d	S c c}w c c}w )
Nr   z#must have at least one return valuer,   , zstd::tuple<rV    ())_schema	argumentsreturnsr_   rn   joinri   name)	kernelargsrv   num_returnscpp_return_valuertuple_returnsra   cpp_arg_types	            rE   get_cpp_op_schemar      s    >>##Dnn$$Gg,K?AAA?a.wqz:	q		7"Ka#6q#9"KL(q9EIJc',-Qsxxj9JLJq<!8 9;;	 #L Ks   ,C	!C.c                    t               dd	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        |      dk(  r ||d         \  }}	 |d	| d	|	        nt        |      dkD  sJ t        |      t        |      k(  sJ t        t                  }
t        t        ||      d
 d      D ]  \  }}|j                  rD|j                  j                         D  cg c]  \  } }d|  d|  }} }dj                  |      }nd} ||      \  }}	d| d| }||
v rp|
j!                  |        ||d| d|	         d d d        d d d        |j#                         fS c c}} w # 1 sw Y   )xY w# 1 sw Y   -xY w)Nc                d    t        | t        j                        r| S t        j                  |       S r<   )
isinstancesympyr
   Integer)items    rE   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLrG   c                    t        |       r| | fS t        fd| D              }j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wr<    ).0gr   s     rE   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wr<   generate_example_arg_valuetype)r   r   wrappers     rE   r   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s*        ::1d1gF   %()callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)grid
sympy_gridr   r   s     rE   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
..z: ==99 22 !+ 
 	
 
 	
rG   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y r<   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)lineexample_gridrx   outputr   s     rE   r   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rG   grid_wrapper_for_def z(meta):r,   r   zreturn c                2    t        | d   j                        S Nr,   r_   kwargsxs    rE   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>      3qt{{3C rG   Tkeyreversezmeta['z'] == z and Trueif z	: return )r   Union[int, sympy.Expr]return
sympy.Expr)r   
TritonGridr<   )r   strr   Optional[str])r0   r   r   r   r   indent
contextlibnullcontextr_   r   r   sortedzipr   r\   rw   addgetvalue)rx   configsgridsr   r   r   fn_namekernel_autotune_calls_indentr   r   seencvalguards	statementr   r   s   `  `           @@rE    user_defined_kernel_grid_fn_coder      s    FM

8J J "$(GWIW%& v}}== 	%%,,.##% !
 
 L6 Lu:?!/a!9D,v&',(@Au:>!>u:W---c?$D "E7#)CT La 88DEHHNNDT7@tS&fSE2F  %\\&1F#F%3D%9"l!&4&9	$#)s6()L>%JKLL L< FOO%%%#L L L Ls8   GB/GGAG%GGG	GG"c                    t               j                  | j                  d       ddlm ddlm t        | j                  g      fd |        j                         S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   )JITFunction)	constexprc           	        t        d t        j                  | j                        D              }| j                  j                  j                  di       }| j                  j                  j                  D ]  }|v r	|| j                  j                  v s"| j                  j                  |   }t        |      rX	j                          	j                  d       	j                  |j                  d       j                  |        |       t        |t        t        t         
f      r	j                          t        |
      rd|j"                  d}n|}|j                  |      x}rKt        |t$              rd|j&                   d	|j(                   }nd|}	j                  | | d
|        n	j                  | d
|        j                  |       r||v sx|dk7  st+        |d      s|j&                  j-                  d      s	j                  d|j&                   d|j(                   d|        j                  |        y )Nc              3  R   K   | ]  }|j                   d k(  r|j                   ! yw)LOAD_GLOBALN)opnameargval)r   insts     rE   r   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>4  s(      '
{{m+ KK'
s   %'__annotations__z@triton.jitTr   ztl.constexpr(rs   : . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__rk   __code__co_namesr   newliner   splicesrcr   intr   boolvaluer   r   __name__hasattr
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverses           rE   r   zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse/  s?   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11:: -	6K..jmm777#22;?fk2#++-#--m<#**6::T*B$((5V$c4(CD#++-!&)4'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$7"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((5#44#t+5 ))44X>
 $-- 1 12(6??:K4P[}] %((5[-	6rG   )
r0   r   r   r   r   triton.languager   r   r   r   )ry   r   r   r   r   r   s    @@@@@rE   9user_defined_triton_kernel_transitive_closure_source_coder      sd    
 %&O6::T2 #) "6??"3486 86t V##%%rG   c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgr   innerr   
inner_exprc                ,    t        | j                        S r<   )r   r   selfs    rE   __str__zSymbolicCallArg.__str__s  s    4::rG   N)r   r   __qualname__r   r   r   rG   rE   r   r   m  s    JrG   r   c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                l    t         |           t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rE   r  zMemoryPlanningState.__init__x  s-    ##D) 	 12(rG   c                L    t        | j                  j                  |d             S r<   )r   r  rk   )r   r   s     rE   __contains__z MemoryPlanningState.__contains__  s    DOO''T233rG   c                \    | j                   |   j                         }|j                  rJ |S r<   )r  pop	is_reusedr   r   r   s      rE   r  zMemoryPlanningState.pop  s+    s#'')>>!!rG   c                \    |j                   rJ | j                  |   j                  |       y r<   )r  r  appendr  s      rE   pushzMemoryPlanningState.push  s&    >>!!##D)rG   )r   ReuseKeyr   r   )r   r  r   FreeIfNotReusedLine)r   r  r   r  r   None)r   r   r   r  r  r  r  __classcell__r
  s   @rE   r   r   w  s    24
*rG   r   c                      e Zd Zy)WrapperLineNr   r   r   r   rG   rE   r  r        rG   r  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterSubgraphLinePythonWrapperCodegenr   r:   r=   c                b    | j                   j                  | j                   j                         y r<   )r   push_computed_sizescomputed_sizesr   s    rE   __post_init__zEnterSubgraphLine.__post_init__  s    (()D)DErG   c                n    | j                   j                  | j                         |j                          y r<   )r   push_codegened_graphr=   	do_indentr   codes     rE   codegenzEnterSubgraphLine.codegen  s"    ))$**5rG   Nr   r  r(  r0   r   r  r   r   r   r   r#  r)  r   rG   rE   r  r    s    !!FrG   r  c                  (    e Zd ZU ded<   ddZddZy)ExitSubgraphLiner  r   c                V    | j                   j                         | j                   _        y r<   )r   pop_computed_sizesr"  r   s    rE   r#  zExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rG   c                X    | j                   j                          |j                          y r<   )r   pop_codegened_graphdo_unindentr'  s     rE   r)  zExitSubgraphLine.codegen  s    ((*rG   Nr*  r+  r,  r   rG   rE   r.  r.    s    !!HrG   r.  c                  *    e Zd ZU ded<   ded<   ddZy)EnterDeviceContextManagerLiner   
device_idxzOptional[int]last_seen_device_guard_indexc                x   t         j                  j                  r|j                  d       t         j                  j                  rg| j
                  ;|j                  t         j                  j                  j                          d       y | j
                  | j                  k(  s{J d       | j
                  H|j                  t         j                  j                  j                          d| j                   d       y |j                  d| j                   d       y y |j                  dt         j                  j                  j                  | j                         d       |j                          |j                  t         j                  j                  j                  | j                               y )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r+   r=   cpp_wrapperr   aot_moder7  
device_opscpp_aoti_stream_guardr6  cpp_aoti_device_guarddevice_guardr&  
set_devicer'  s     rE   r)  z%EnterDeviceContextManagerLine.codegen  sP   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ P NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrG   Nr+  )r   r   r   r   r)  r   rG   rE   r5  r5    s    O"//KrG   r5  c                      e Zd ZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y r<   )r+   r=   r<  r3  r'  s     rE   r)  z$ExitDeviceContextManagerLine.codegen  s     ww"" #rG   Nr+  r   r   r   r)  r   rG   rE   rD  rD    s    rG   rD  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner  r   c                    | S )zFirst pass to find reuser   r   states     rE   planzMemoryPlanningLine.plan  s    rG   c                     y)zSecond pass to output codeNr   r'  s     rE   r)  zMemoryPlanningLine.codegen  s    rG   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        r   =rr   rp   rs   )dataclassesfieldsrx   getattrr  r   r   Bufferget_namer   rw   )r   rz   fieldr   s       rE   r   zMemoryPlanningLine.__str__  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::rG   NrK  r   r   rH  r+  r   r   )r   r   r   r   rL  r)  r   r   rG   rE   rH  rH    s    !!);rG   rH  c                  (    e Zd ZU ded<   ddZddZy)AllocateLine
BufferLikerC   c           	        | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  rG||v rC|j                  |      }d|_        t        | j                  |j                   | j                         S | j                   j                         j                  dk(  rh| j                  j                  | j                         }|A|xj                   t#        t%        j&                  t(        j*                  |d            z  c_        | S )NTcpur,   )rC   rT  r+   r=   removed_buffersNullLiner   rF   r   allow_buffer_reuser  r  	ReuseLiner?   r   static_shape_for_buffer_or_noner	  r   	functoolsreduceoperatormul)r   rK  r   	free_linestatic_shapes        rE   rL  zAllocateLine.plan  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EE99((*//58<<GG		RL'11S$$X\\<C6 1 rG   c                    | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y r<   )rC   rT  r+   r=   r]  r   make_buffer_allocationr   r   r(  r   s      rE   r)  zAllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trG   NrV  r+  )r   r   r   r   rL  r)  r   rG   rE   rY  rY    s    
(rG   rY  c                  6    e Zd ZU ded<   dZded<   d	dZd
dZy)r  rZ  rC   Fr   r  c                   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j!                  t#        | j                        |        | S r  )r_   rC   get_inputs_that_alias_outputr   layoutr   MultiOutputLayoutr  rT  r+   r=   r]  r^  r   r   r_  r  rF   rJ  s     rE   rL  zFreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$JJ'		2D9rG   c                    | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y r<   )	rC   rT  r+   r=   r]  r  r   r   make_buffer_freer'  s     rE   r)  zFreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rG   NrV  r+  )r   r   r   r   r  rL  r)  r   rG   rE   r  r  
  s    
It
ErG   r  c                  @    e Zd ZU ded<   ded<   dZded<   d
dZddZy	)r`  rZ  rC   	reused_asTr   
delete_oldc                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S r<   )rC   rT  r+   r=   r]  rs  r^  r   rJ  s     rE   rL  zReuseLine.plan'  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrG   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y r<   )
rC   rT  r+   r=   r]  rs  r   r   make_buffer_reusert  r'  s     rE   r)  zReuseLine.codegen.  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rG   NrV  r+  )r   r   r   r   rt  rL  r)  r   rG   rE   r`  r`  !  s"    
J
rG   r`  c                      e Zd Zy)r^  Nr  r   rG   rE   r^  r^  6  r  rG   r^  c                  X    e Zd ZU ded<   ded<   ed	d       Zed
d       Zedd       Zy)CommBufferLiner  r   	ir.BufferrC   c                    ddl m} | j                  j                         }| j                  j	                         } ||      rt        d| j                         t        |      |j                  z  S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr}  rC   	get_numelr@   r`   r   itemsize)r   r}  numelr   s       rE   sizezCommBufferLine.size?  sa    5		##%		##%u ?		{K  5zENN**rG   c                    | j                   j                         }t        |t        j                        sJ |j
                  S r<   )rC   get_output_specr   r   CommBufferLayoutcomm_buffer_typer   rn  s     rE   r  zCommBufferLine.comm_buffer_typeK  s6    **,&""5"5666&&&rG   c                    | j                   j                         }t        |t        j                        sJ |j
                  S r<   )rC   r  r   r   r  
group_namer  s     rE   r  zCommBufferLine.group_nameQ  s6    **,&""5"5666   rG   Nr   r   )r   zir.CommBufferTyperW  )r   r   r   r   propertyr  r  r  r   rG   rE   rz  rz  :  sG    !!
O	+ 	+ ' '
 ! !rG   rz  c                  $    e Zd ZddZed        Zy)CommBufferAllocateLinec                "   | j                   j                         t        j                  j                  vsJ | j                   j                         }| j                   j                         }| j                   j                         }t        | j                   j                               }t        | j                   j                               }|j                  | j                  | j                  | j                  | j                  |||||             y r<   )rC   rT  r+   r=   r]  
get_devicer@   r   get_size
get_strider   make_allocation_liner  r  r   )r   r(  rx   devicer   shapestrides          rE   r)  zCommBufferAllocateLine.codegenZ  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rG   c                    | t         j                  j                  k(  rS| d|j                  |       d|j                  |       d| d|j                   d| dt        j                  dd       dS t        d	|        )
Nz = empty_strided_p2p(rp   z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    rs   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupleindexrandomrandintNotImplementedError)r  r  r   rx   r  r   r  r  s           rE   r  z+CommBufferAllocateLine.make_allocation_linen  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rG   Nr+  )r   r   r   r)  staticmethodr  r   rG   rE   r  r  X  s    
(  rG   r  c                      e Zd ZddZy)CommBufferFreeLinec                    | j                   j                  | j                        }|j                  | d| j                  j
                   d       y )Nz # z buffer free)r   rq  rC   r   r  r   rj  s      rE   r)  zCommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrG   Nr+  rF  r   rG   rE   r  r    s    NrG   r  c                  j    e Zd ZdZ fdZe	 dz	 	 	 	 	 	 	 d{d       Zd|dZd}dZd|dZ	d~dZ
d|d	Zed|d
       Zed|d       ZddZedd       Zd|dZ	 	 ddZddZd|dZd|dZd|dZddZddZddZd|dZd|dZdzddZd Zd Zd Zd Z d Z!ddZ"dd Z#d|d!Z$dd"Z%dd#Z&dd$Z'dd%Z(d& Z)d' Z*	 	 	 	 	 	 	 	 	 	 	 	 dd(Z+dd)Z,d* Z-d+ Z.d, Z/	 	 	 d	 	 	 	 	 	 	 	 	 dd-Z0d. Z1dd/Z2d0 Z3d1 Z4d2 Z5d3 Z6	 	 	 	 	 	 dd4Z7d5 Z8dd6Z9d7 Z:d8d9dd:Z;d8d9dd;Z<dd<Z=dd=Z>dd>Z?dd?Z@dd@ZA	 dz	 	 	 ddAZBddBZCdC ZDdD ZEdE ZFdF ZG	 	 	 d	 	 	 	 	 	 	 	 	 ddGZHddHZI	 	 ddIZJdzddJZKddKZLddLZMdM ZNdN ZOdO ZPdP ZQdQ ZRdR ZSdS ZTdT ZUddUZVdV ZWdd8ddddW	 ddXZXdY ZYdZ ZZd[ Z[dzd\Z\dd]Z]	 dzd^Z^dd_Z_dd`Z`ddaZaddbZbddcZcdddZdddeZedf ZfdzdgZgdh ZhddiZidj Zj	 	 	 	 	 	 	 	 ddkZkdl Zldm Zm	 	 	 	 ddnZnddoZodp Zpdq Zqdr Zrds Zsdt Ztedu        Zuedv        Zvedw        Zwedx        Zxedy        Zy xZzS )r  zB
    Generate outer wrapper in Python that calls the kernels.
    c                    t                    t                _        t	                _        t	                _        t	                _        t	                _        t	                _	        t	                _
        t	                _        t	                _        t	                _        t        t                   _        i  _        t                _        g  _        d _        d _        d _        d _        d _        t2        j4                  j6                  rdnd _        t2        j4                  j6                  rdnd _        d  _        d _        i  _         t        t                   _!        t                _"        d  _#         jI                          g  _%        g  _&         jO                           jQ                           jS                          t2        j4                  jT                  sBt2        j4                  jV                  jY                         D ]  \  }} j[                  ||        t        t\                   _/        t        t\                   _0        i  _1         te        jf                  d        jh                         _4        te        jf                  d       d	 fd       }| _5        i  _6        t        t                   _7        tq                _9        t        t                   _:        i  _;        ty        tz        j|                  j~                  tz        j|                  j                         _A        g  _B        y )
N #r  z
std::move(rs   Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y r<   )importsr   r   r   r   r   )r   r   s    rE   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rG   )debug_printer_leveluse_array_ref)r   r   r   r  )Cr  r  r   _names_iterr0   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   r   src_to_kernelkernel_numel_exprlinesdeclaredeclare_maybe_referenceendingcommentnone_strr+   r=   r<  
move_beginmove_endr7  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr"  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerwrite_prefix!write_kernel_autotune_defs_headerr=  constant_reprsr\   write_constant
BufferName	allocatedfreedreusesrb  	lru_cachewrite_get_raw_streamr  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   rx   hashedr  r
  s   `   rE   r  zPythonWrapperCodegen.__init__  s   */'%'$&$&$&#1#3 *,$2$4!%3%5"$2$4!%/_%6" .0HRCE
')$*+''*=*=,2 ww22;?)+/(QS&%/_%6"8B $!!# &("$&!..0ww ! 6 6 < < > 2f##D&12 $J/1
+-
 57$=I$7$7$=%%%
! 
		T	"	; 
#	;
  /&($S/+"2"4+5c?+<(46! 1 & 3 3 T T --DD
 !#rG   Nc                D    | r|J |J t        |||      S t               S r<   )SubgraphPythonWrapperCodegenr  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rE   createzPythonWrapperCodegen.create  s?      ,,,!---/~/C  $%%rG   c                    d| _         y )Ncall)r  r   s    rE   r  z)PythonWrapperCodegen.set_launcher_fn_name  s
     &rG   c                D    | j                   j                  | d|        y )Nz = None  # )r  r   )r   rx   r  s      rE   r  z#PythonWrapperCodegen.write_constant  s    k&:;rG   c           	     T   t         j                  j                  j                         }d}||j                  d|j                   }d}t        t        j                  j                        dkD  rd}| j                  j                  d| dt        j                   d| dd	
       | j                  j                  dd	
       	 ddlm} | j                  j                  dd	
       t        j$                  r| j                  j'                  d       y y # t         t"        f$ r Y >w xY w)Nr  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from z import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r  r  r  r   r   r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r   contextaot_config_commentaot_inductor_debug_utilsr  s        rE   r  z!PythonWrapperCodegen.write_header  sF   --..6687#9#9#E#-g.D.D-E!F#% v""CCDqH'{$#$ % $,,- . ** +#& ) 	 	
, 	  	 	
	 DKK 	   ##KK!!"?@ $ , 		s   #D D'&D'c                     y r<   r   )r   r  s     rE   include_extra_headerz)PythonWrapperCodegen.include_extra_header5      rG   c                ^    | j                   j                  dt        j                   d       y )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )r  r   r   r   r   s    rE   r  z6PythonWrapperCodegen.write_kernel_autotune_defs_header8  s3    !!((
 $,,- .	
rG   c                   dt         j                   d}t        j                  j                  r]| j
                  j                  |       | j
                  j                  t        j                  j                  j                  d             t        j                  j                  s`| j                  j                  |d       | j                  j                  t        j                  j                  j                  d             y y )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r"   r   r   r   r   r   r   r   r+   r=   r>  import_get_raw_stream_asr<  r  )r   
import_strs     rE   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_onceH  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rG   c                v   t         j                  j                  rB| j                  j	                  t
        j                  j                  j                  d             t
        j                  j                  sC| j                  j	                  t
        j                  j                  j                  d             y y )Nr  )r   r   r   r   r   r+   r=   r>  r  r<  r  r   s    rE    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_onceZ  s{    ==11&&00"";;<LM ww""LL"""";;<LM #rG   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmetar   )rX   r  r_   r  r   r   r   r   r   r  r   )r   r  vars      rE   add_meta_oncez"PythonWrapperCodegen.add_meta_oncee  s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  rG   c                z    | j                         D cg c]  }|j                  | j                         c}S c c}w r<   )get_graph_outputscodegen_referencer  r   r   s     rE   get_output_refsz$PythonWrapperCodegen.get_output_refsp  s<     =A<R<R<T
78A 1 12
 	
 
s   "8c                     y r<   r   r   s    rE   mark_output_typez%PythonWrapperCodegen.mark_output_typev      rG   c                6    t         j                  j                  S r<   )r+   r=   graph_inputsr   s    rE   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputsy  s     ww###rG   c                6    t         j                  j                  S r<   )r+   r=   graph_outputsr   s    rE   r  z&PythonWrapperCodegen.get_graph_outputs~  s    ww$$$rG   c           
        | j                         j                         D ]  \  }}t        |t        j                  t
        j                  f      r1|t        j                  j                  vst        |t
        j                        rht        |j                               dk(  r| j                  |j                               }| j                  |j                               }| j                  j!                  d| d| d| d        y )Nr   zassert_size_stride(rp   rs   )r  r\   r   r   r
   r   TorchBindObjectr+   r=   graph_input_namesGeneratorStater'   r  r   r  r  r   )r   rx   bufr  r  s        rE   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_asserts  s    ..0668 	SID##

B,>,>?@ 177444
R&&9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR	SrG   c                `   | j                   j                  d       | j                         j                         D ]r  \  }}t	        |t
        j                  t        j                  f      r1d| d}| j                   j                  |       d| d}| j                   j                  |       t y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r  r\   r   r   r
   r   r  )r   rx   r  r   s       rE   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_asserts  s    HI..0668 	(ID##

B,>,>?@ &;<DKK!!$' &;<DKK!!$'	(rG   c                :    | j                   j                  d       y )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r   s    rE   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_wait  s    	
rG   c                    dj                  |      }t        |      dk(  r|dz  }| j                  j                  | d       | j                  j                  d       y )Nrp   r,   ,z = argszargs.clear())rw   r_   r  r   )r   input_nameslhss      rE   
write_argszPythonWrapperCodegen.write_args  sP    ii${q 3JCWo.n-rG   c                    t         j                  r| j                  j                  d       d}|S | j                  j                  d| j                   d       d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r,   )r   graph_partitionr  r   r  r   prefix_indents     rE   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sm    !!KK M  KK**+ ,
 MrG   c                6    t         j                  j                  S r<   )r+   r=   r  r   s    rE   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_names  s    ww(((rG   c                   | j                   J | j                          | j                         }| j                  j	                  |      5  t
        j                  j                  rA| j                  j                  t        j                  j                  j                                t        j                  j                         }t
        j                  r| j                  j                  d| d       | j                         x}r| j!                  |       | j#                          | j%                          d d d        y # 1 sw Y   y xY w)Nz0training_annotation = nvtx._device_range_start(''))r  r  r!  r  r   r   r   debug_sync_graphr   r+   r=   r>  synchronizeget_training_phaser  r#  r  codegen_inputs"codegen_input_size_and_nan_asserts)r   r   phaser  s       rE   r  z!PythonWrapperCodegen.write_prefix  s    $$000%%'>>@[[. 	6}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!335	6 	6 	6s   
C,D??Ec                    t         j                  r| j                          t         j                  r| j	                          y y r<   )r   size_assertsr  nan_assertsr  r   s    rE   r*  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rG   c                   | j                          d| }t        j                  j                  r=| j                  j                  | d| d       t        j                  j                  r|S | j                  | d| d       |S )Nstream = get_raw_stream(rs   )	r  r   r   r   r   r   r+   r=   r<  )r   r6  r=   rx   s       rE   r  z)PythonWrapperCodegen.write_get_raw_stream  s    --/
|$==11&&00&*:,a8 ww""$1*Q?@rG   c                     | j                   d   S )N)r  r   s    rE   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph  s    ))"--rG   c                :    | j                   j                  |       y r<   )r  r  )r   r=   s     rE   r%  z)PythonWrapperCodegen.push_codegened_graph  s    ""))%0rG   c                6    | j                   j                         S r<   )r  r  r   s    rE   r2  z(PythonWrapperCodegen.pop_codegened_graph  s    ))--//rG   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyr8  r  r  )r   r"  r8  s      rE   r!  z(PythonWrapperCodegen.push_computed_sizes   s!    !((//0HIIrG   c                6    | j                   j                         S r<   )r  r  r   s    rE   r0  z'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rG   c                .    t        | j                         S r<   )nextr  r   s    rE   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rG   c                8   | j                  t        || j                               t        j                  j
                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          | j                  j                  t        j                  j                  j                  |             | j                  j                  d| d| d       || _        y )Nr:  r;  r0  r1  rs   )r   r5  r7  r   r   r   r  r   r+   r=   r>  rA  r&  rB  )r   r6  s     rE   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter  s    )*d6W6WX	
 ==11))+&&00**77
CDAF &&002&&00""--j9 &&00$6zl!D -7)rG   c                    | j                  t                      t        j                  j                  r| j
                  j                          y y r<   )r   rD  r   r   r   r   r3  r   s    rE   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rG   c                    |r1| j                   j                  ddj                  |      z   dz          y | j                   j                  d       y )Nzreturn (rp   , )z	return ())r  r   rw   )r   output_refss     rE   generate_returnz$PythonWrapperCodegen.generate_return#  s@    ''
TYY{5K(Ke(ST''4rG   c                     y r<   r   r   results     rE   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffix)  r  rG   c                    t         j                  rNdj                  | j                        t	        | j                        dk(  rdndz   }|j                  d| d       y y )Nrp   r,   r  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  rw   all_partition_namesr_   r   )r   rH  all_partition_name_lists      rE   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffix,  se    !!&*ii0H0H&I43349r'# MM--D,E F "rG   c                     y r<   r   rG  s     rE   generate_endz!PythonWrapperCodegen.generate_end:  r  rG   c                (    | j                  ||       y r<   )generate_extern_kernel_alloc)r   fallback_kernelrz   s      rE   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernel=  s    ))/4@rG   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complexz.clone()rr   rp   rs   r   inductorintermediate_hooksr,   zrun_intermediate_hooks()r   rn  r   
NoneLayoutrT  get_origin_nodeget_kernel_namer  r   memory_planningr   r  rw   r  generate_intermediate_hooksr   rx   )r   extern_kernelrz   	no_returnoutput_nameorigin_nodekernel_namer  s           rE   rQ  z1PythonWrapperCodegen.generate_extern_kernel_alloc@  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rG   c                    t         j                  j                  j                  }|j	                  ||d d d       |j                  d|r|n|        |5  | j                  | ddj                  |       d       d d d        y # 1 sw Y   y xY w)Nexternzout=rr   rp   rs   )r+   r=   wrapper_coder  set_printer_argsr  r   rw   )r   ry   outout_viewrz   r  debug_printer_managers          rE   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out]  s     !" 4 4 B B..tVT4Rdx8S9:;" 	;NNfXQtyy&7q9:	; 	; 	;s   'BBc                    |j                   }|j                  }|r$t        d |D              }t        d |D              }|j                  j	                          d}dj                   fd|D              }dj                   fd|D              }t        j                   |j                        }d}| d|j                   d	}| d| d| d| }	| d
|	 d}
|
S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wr<   r+   r=   rA   atomically_apply_size_hintr   ds     rE   r   zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>p  s%     VA))DDQGV   35c              3  n   K   | ]-  }t         j                  j                  j                  |       / y wr<   rl  rn  s     rE   r   zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>q  s*      CD  ;;A>rp  z.data_ptr()rp   c              3  J   K   | ]  }t         j                  |        y wr<   r  val_to_arg_strr   dimr   s     rE   r   zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>w  s     XC-<<T3GX    #c              3  J   K   | ]  }t         j                  |        y wr<   rs  ru  s     rE   r   zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>x  s$      
?B //c:
rw  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorrr   rs   )
dims
block_dimsr   tensorr  rw   r  rt  element_sizerank)r   descapply_size_hintsrz  r{  ptrr}  r  r   rz   r  s   `          rE   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_calll  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArG   c                    | j                  |      }|j                   d| | j                   }| j                  |       y Nr   )r  rx   r  r   )r   r  r  r   s       rE   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trG   c                    | ddj                  t        t        |             }|j                  d      r|dj                  dg|z         z  }n|r|dt	        |       z  }|dz  }| j                  |       y )Nrr   r  zaten.scatter_reducerp   r  z	, reduce=rs   )rw   mapr   r   rX   r   )	r   r   inputscpp_kernel_namepython_kernel_namesrc_is_tensorrc  r   r   s	            rE   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22trG   c                |    ddj                  |       d}||||g}| j                  | j                  ||             y )N[rp   ])rw   r   wrap_kernel_call)r   ry   r   indicesvalues
accumulateindices_strrz   s           rE   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  sA    $))G,-Q/;
3t,,VT:;rG   c           	     V    | j                  | d| ddj                  |       d       y )Nr   rr   rp   rs   )r   rw   )r   buf_namer  r  codegen_argsop_overloadraw_argsoutputss           rE   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s0     	(3'9&:!DIIl<S;TTUVWrG   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzPythonWrapperCodegen.generate)r   	_generate)r   is_inferences     rE   generatezPythonWrapperCodegen.generate  s,    9: 	0>>,/	0 	0 	0s   '0c                &    t         j                  ryy)Nr   r,   )r   r  r   s    rE   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rG   c                D	   t         j                  r| j                          t               }|j	                  | j
                         |j                  d       |j	                  | j                         t        j                  j                  r>t        j                  j                  r$t        j                  j                  r
t               }|j	                  | j                         t        j                         5 }|j!                  | j"                  j%                                t         j&                  r| j)                  |       t         j                  r| j+                          |r!t         j,                  r| j/                          n| j1                          t         j2                  j4                  r*t         j2                  j6                  s| j9                          | j:                  D ]I  }t=        |t>              r|jA                  | j"                         /| j"                  j                  |       K | jC                         }| jE                          t         j2                  jF                  rA| j"                  j                  t        j                  jH                  jK                                t         j                  r| jM                          t         j2                  j4                  r*t         j2                  j6                  s| jO                          t         j2                  j6                  r| jQ                          t         jR                  r+t         j                  s| j"                  j                  d       | jU                  |       d d d        | jW                          |j	                  | jX                         | j[                         }|j%                  |      5  |j	                  | j"                         d d d        | j]                  |       |j	                  | j^                         | ja                  |       | jc                  |       | je                  |       |jg                         | jh                  jg                         fS # 1 sw Y   xY w# 1 sw Y   xY w)Nr  z+nvtx._device_range_end(training_annotation))5r   profile_bandwidthr  r0   r   r  r   r  r+   r=   r=  r<  is_const_graphr  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr[  memory_planmemory_plan_reuser   store_cubinr   !generate_reset_kernel_saved_flagsr  r   r  r)  r  r  r&  r>  r'  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  rE  finalize_prefixr  r  rI  r  rM  rO  add_benchmark_harnessgetvaluewithlinemapr  )r   r  rH  stackr   rD  wrapper_call_indents          rE   r  zPythonWrapperCodegen._generate  s?   ##))+!dll#dkk" 77 3 38N8N#%F 	d//0!!# )	.u 1 1 8 8 :;0088?''))+  6 6  "&&(}}((1W1W668

 6dK0LL!2!23%%//5	6 ..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -S)	.V 	dkk""::<]]./ 	-MM$++,	- 	##F+dkk"""6*&!""6* &&($$88:
 	
w)	. )	.`	- 	-s   9JR	R	RRc                8   | j                   j                  d       i }| j                   j                         dz   | j                  j                         z   }t        j
                  t        j                  k(  rkt        j                  t               dd      5 }|j                  |j                  d             |j                  }ddd       t	        j                  d       	 t        ||       y# 1 sw Y   -xY w# t         $ r}t#        d	|       |d}~ww xY w)
z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        r9  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )r  r   r   r   r   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderx   debugexec	ExceptionRuntimeError)r   scopetuning_codef	file_pathes         rE   r  z4PythonWrapperCodegen.generate_and_run_autotune_block  s   
 	!!((	
 %%..0((1134 	
   GMM1 ,,Ke #**734FF		#
 !!0
	Se$# #  	S!FqcJKQRR	Ss$   -C0#C< 0C9<	DDDc                \    ddl m}  ||       j                  | j                        | _        y )Nr,   )MemoryPlanner)r[  r  rL  r  )r   r  s     rE   r  z PythonWrapperCodegen.memory_plan'  s     2"4(--djj9
rG   c                   t         j                  j                         }| j                  rt	        | j                  d   t
              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt	        | j                  d   t
              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t	        |t
              r"|j                  |d         | j                  |<   Dt	        |t              r|j                  t                      nt	        |t              s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t!        d |D              }y )Nr3  r   c              3  4   K   | ]  }|j                     y wr<   )r	  )r   ss     rE   r   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>I  s      +
./A))+
s   )r+   r=   get_output_namesr  r   rH  rC   rx   r  r   ranger_   rL  r  r  r.  sum)r   	out_namesplanning_statespast_planning_statesir   _total_allocated_buffer_sizes          rE   r  z&PythonWrapperCodegen.memory_plan_reuse,  s}   GG,,.	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rG   c           	     4  	 | j                   	t        j                  d       	fd       }t        j                  d       	fd       }t        |t        j
                        rGt        |t        j                        r||v ry 	j                  | d|        |j                  |       y t        |t        j                        rt        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X t        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X y t        |t        j                        ry t        |t        j                        ry t         j"                  j$                  j&                  ry t)        dt+        |             )Nc                <    j                  |  d|  d       |  dS )Nz_size = z.size()_sizer   rx   r(  s    rE   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofU  s(    NNdV8D69:V5>!rG   c                <    j                  |  d|  d       |  dS )Nz
_stride = z	.stride()_strider  r  s    rE   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofZ  s)    NNdV:dV9=>V7##rG   r   r  r  zUnknown value type: )r  rb  r  r   r   r
   Symbolr   r   r   	TensorBox	enumerater  r  r  r  r  	_inductorr   r  r`   r   )
r   rx   r   
bound_varsr  r  rv  r  r  r(  s
            @rE   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignmentM  s    {{			T	"	" 
#	" 
		T	"	$ 
#	$ eUZZ(eU\\2ez6INNeWCv./NN5!r||,&u~~'78 )	TdELL1d*6LNNdV3vd|nAcU!#DENN4()  ))9)9);< +Vfell3j8PNNfXS$0@#a#HINN6*+ r112r001%%55$';DK=%IJJrG   c           	        t        t        j                            }| j                         }|j	                         D cg c]$  \  }}t        |t        j                        s!||f& c}}|j	                         D cg c]$  \  }}t        |t        j                        r!||f& c}}z   }|D ]  \  }}| j                  |||        yc c}}w c c}}w )z$Assign all symbolic shapes to localsN)r   r   r  r  r\   r   r  )r   r  r
  kvr  rx   r   s           rE   r)  z#PythonWrapperCodegen.codegen_inputsw  s    -/
 ,,.+113
q!z!U\\7RQF
 , 2 2 4X1Jq%,,<WaVXY " 	JKD%00ujI	J
Xs   "C"C>"C!Cc                P   t        |t        j                        rt        |t        j
                        rq|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }| j                  | dt        |              y y y r  )r   r   r  r   r   PRECOMPUTED_SIZEr"  r   r+   r=   rA   inv_precomputed_replacementsr   pexpr)r   symexprs      rE   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed  s    c5<<(^CAVAV-Wd)))##C(77##@@EDNNcU#eDk]34 .X(rG   c                     y r<   r   r   s    rE   r  z$PythonWrapperCodegen.finalize_prefix  r  rG   TrB   c                   t        d      )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r   r   rB   s      rE   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrG   c                   t        ||      S )Nr  )r  r  s      rE   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rG   c                $    | j                  |      S r<   )r  r  s     rE   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar  s    **1--rG   c                    | d| dS )Nr  r  r   )r   basenamerx   r  s       rE   codegen_tuple_accessz)PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rG   c                    g t        | j                  |      }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj                  |       dS )Nr   ()r,   rr   rC  rp   rs   )r  r  r_   rw   )r   r  partss      rE   r   z/PythonWrapperCodegen.codegen_python_shape_tuple  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rG   c                $    | j                  |      S r<   )r   )r   r  s     rE   r  z(PythonWrapperCodegen.codegen_shape_tuple  s    ..u55rG   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            S )Nzalloc_from_pool({})rp   )formatrw   r  r   r   )r   rx   offsetr   r  r  s         rE   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_pool  sS    $++II&MJ33E:33F;

 
	
rG   c                   ||j                   j                  k(  rk||j                   j                  k(  rR||j                   j                  k(  r9|&||j                  k7  rd|j                          d| dS |j                          S | j                  |      }| j                  |      }| j                  |      }|/||j                  k7  r d|j                          d| d| d| d| dS d|j                          d| d| d| d	S )Nzaten.view.dtype(rp   rs   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()rn  r  r  r  r   rT  r   r  )r   datar  r  r  r   r   s          rE   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_view  s    DKK$$$$++,,,$++,,, Udjj%8)$--/):"UG1EE--/*+2248D44V<F))&1F Udjj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrG   c                8    | j                  | d| d| d       y )Nz.copy_(rp   rs   r  )r   r   dstnon_blockings       rE   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copy  s!    #gcU"\N!<=rG   c                `    | j                  | j                   | d| | j                          y r  )r   r  r  )r   rx   r   s      rE   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_output  s)    $,,vS}EFrG   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wr<   )r  )r   ts     rE   r   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s     >Q1&&(>s   r   r   .item()r,   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)r  r_   keypathr   r  r   r   r   divisorr`   rT  )r   rC   r  s      rE   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rG   c           
     0     fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d       t*              rdt        |t,        j.                        rct        t        j                  j                        d	k(  rj                  d
       j                  d|         |||j1                                t        |t         j2                        r4 ||t        j                  j$                  j5                  |d             /t        |t,        j6                        r# ||d|j                  j8                   d       l|j;                         D cg c]-  }t        j                  j$                  j5                  |d      / }	}|j=                         D cg c]-  }t        j                  j$                  j5                  |d      / }
} |||	|
|j?                         |jA                                " ddjC                  t        j                  j                  jE                                d}j                  d|        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(rp   
, device='	', dtype=rs   )r   r   )rx   r  r  r  r   r   r   s        rE   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rG   c                2    j                  |  d|        y r  r  )rx   r   r   s     rE   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rG   c                    dd l }t        |t        j                        sJ j	                  |  d|j                  |      d       y )Nr   z = pickle.loads(rs   )pickler   r  ScriptObjectr   dumps)rx   r   r  r   s      rE   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  sB    eU%7%7888v%5fll56I5LANOrG   )r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()zcall([rp   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r+   r=   	constantsr\   r   r  r  r  r   r_   torchbind_constantsr
  r   r   r  rA   
var_to_valrk   r   r   r  get_real_objr
   	size_hintr  r  r  r  r  r@   rw   keys)r   r   r  r  r!  rx   r   torchbind_objr   r  r  call_strs   ``          rE   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module  sy   		0	P 	K	
 ]]_ E	YMM     !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; (eeU\\2zGG$$//33E4@,8 eR%7%781776671<((9$$wtf%56'e.@.@.BCuzz2
 #4)9)9)C)CETV)C)WXr'8'89"89K9K8LLcd "'!1 ((221r2BE  "'!1!1!3 ((221r2BF  #((*)E(T  		!''*>*>*C*C*E FGrJH}XJ78WXKE	Y E	YfoE	Y E	Ys+   J>P42P&P:2P,BP
PPc                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr.  r%  r   r%   r   r   s     rE   r  z*PythonWrapperCodegen.add_benchmark_harnessL  ss     ''&&v.@A]]_ 	X,-?-A,BB_`	 	 	s    A//A8c                   t         j                  j                  r>d| d| }| j                  j	                  |       t
        j                  j                  ry |r| dnd}d| | d| }| j                  j	                  |       y )Nz

r   r9  r  )	r   r   r   r  r   r+   r=   r<  r  )r   ra  kernel_bodymetadatagpucpp_definitionbodymetadata_comments           rE   define_kernelz"PythonWrapperCodegen.define_kernel^  s     ==11+c+7D%%,,T2ww"".6hZr?B&'}C}E4 rG   c                :    | j                   j                  |       y r<   )r  r   )r   fn_codes     rE   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fnq  s    !!((1rG   c                  *+,-./ ddl m} ddlm}m}	m}
 ddlm*m}m	}m
}m} ddlm}m}  |        |j                  }g /i -g ,g },/fd+d)*+-fd		}t!        |j"                        D ]  \  }}||j$                  v r || *|
      d       (|vr-|   }|    || *|
      d       Jt'        |t(        j*                        r || ||
             ut'        |t(        j,                        r/ || |||j/                         |j1                                      t'        |t(        j2                        rO || |||j4                  j/                         |j1                         |j6                  j8                               't'        |t:        t<        j>                  f      xr* t@        jB                  jD                  jG                  |d      } || |||      |        tI        /d ,|j"                  D cg c]  }tK        |       c}      }|tM        jN                  t@        jB                  jQ                               i -tR        jU                  |d      tW        /,      gd}|rtY        |      |d<   |rtY        |      |d<   t[        |      dk(  r0|	j]                         }g t_        t<        j`                  |d         }nd*.fd}i .|D cg c]  }g t_        ||       }}|rt[        |      t[        |      k(  sJ g }tc        te        ||      d d      D ]@  \  }} |jg                   ||       g t_        th        |      g t_        tj        |      d       B |
j                  |g t_        tl        .jo                               d}g .jq                         }ts        |jt                        g}!t[        |      dkD  rQjo                         D ]>  }t'        |t(        j,                  t(        j2                  f      r.|!jg                  |       @ |!jg                  tm        |             |!jw                  tm        |             tY        |!      }!|!| jx                  v rg | jx                  |!   |S | dt[        | jx                         }"t{               }#t|        j                  j~                  r|#j                  d|"d       n|#j                  d|d       |"|d<   |j                  |j                                |#j                   |              |#j                  dg t_        ||      d |d!|d"       t        |      }$t|        j                  j~                  r|$j                  d#| d$d#|" d$      }$|#j                  |$       t@        jB                  jQ                         }%|#j                  d%|%j                   d&       t        j                  |jt                        \  }&}'t        j                  |jt                        }(d'|( d(|' })| j                  |"|#j                         |)       |"|f| jx                  |!<   |"||fS c c}w c c}w )+Nr   )patch_triton_dtype_reprr   )config_to_dict	FixedGridPrecomputedGridr,   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J    j                  |       j                  |        y r<   )r  )idxra   arg_indices	signatures     rE   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature  s    S!s#rG   c                   |r?t               r	 | |       |j                  v r|j                     |j                  <   y y |j                  v sJ |r>t               r |  |j                               n	 | |       d|j                  <   y |r4t               r |  |j                               d |j                  <   y  | |       y )Nrx   r,   )r*   rx   )	rJ  ra   is_constexprequals_1equals_nonerB  rM  r&  r   s	        rE   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg  s    13 %S#.88v% +1*:Ichh' & xx6)))57
 )l.IJ(c2*+Ichh' 57 )l.IJ*.Ichh'$S#.rG   rO  T)rP  )rR  )rx   bufferr   )rx   rT  r   r  )rQ  )
size_dtyper  argdefs)r  )rL  r  r&  r   restore_valuereset_to_zeroc                N   t        | t        j                        rdg | j                  }|s| S |j	                  t
               |D ]+  }|v rt        j                  dt                     |<   - t        |       S t        | t              sJ t        j                  |       S )N)r   _launcher_s)r   r   r
   free_symbolssortr   r  r_   r)   r   r   )r  symbolsr  extra_launcher_argss      rE   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher  s    dEJJ/2 1 12G"#LLSL)& "55$38<<)#.A*B)CD4+C0 &d,?@@!$,,,}}T**rG   c                2    t        | d   j                        S r   r   r   s    rE   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>-  r   rG   r   )r   pythonrW   )	grid_typeprecomputed_gridsr^  _zasync_compile.triton(z, '''ra  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   rr   z''', device_str='r%  z# Original path: r;  )FFF)r  r   r   r   )Ltorch.utils._tritonr>  runtime.triton_heuristicsr?  r@  rA  commonrB  rC  rD  rE  rF  r   rG  rH  r   r  	arg_names
constexprsr   r   TMADescriptorrS  rT  r@   r!   r  rn  r  r   r   r   r+   r=   rA   statically_known_equalsr7   r-   r#   r  get_current_device_or_throwdictfromkeysr5   r   r_   setup_grid_as_argsr  sympifyr   r   r  r  r4   r   r  r+  idr   extendr  r0   r   unique_user_kernel_namesr   updateinductor_meta_commonr   r   replacer   inspectgetsourcelinesgetsourcefiler9  r   )0r   ry   r   r   restore_value_argsreset_to_zero_argsr   r>  r?  r@  rA  rC  rD  rE  rF  rG  rH  original_nameequal_to_1_argsrS  rJ  r   ra   rQ  r   triton_signaturetriton_metainductor_metaextra_launcher_call_argsr_  r   rc  cfg	cache_keyrx   r   
kernel_srccurrent_devicerd  linenosrcfiler4  rB  rM  rK  r&  r^  rL  s0      `                                      @@@@@@rE   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernelt  sO    	@	
 	

	
 	
 	D!)+	$&	!#%'	$"	/ "	/H "&"2"23 1	GHCf'''\s3$G& +Cc{"\s3Fc2#3#34(!$  RYY/!!$#&<<>"%--/  R%7%78 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFc1	Gf -)/)9)9:AWQZ:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<OEOSZ3w<777 "#E7#)CT 		c "(("0"5"5Ct$4"52UD!12	 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}} *!#		23E3E'FG$$S)* 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~w78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJz*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::I ;j Ps   Y&)Y+c                    | d|j                    d}||d| z  }| j                  | dt        |j                                t	        ||j                        S )Nrd  r  r   )r  r   r  r  r   )r   ra  treer  r  s        rE   generate_numel_exprz(PythonWrapperCodegen.generate_numel_expry  sb    a}E2axL D$s5#4"567 tTZZ00rG   c                   |j                         }t        | |      }|j                  t        j                  k(  r| j                  |       n1|j                  t        j                  k(  r2| j                  |       | j                  | j                  |             n|j                  t        j                  k(  r| j                  j                  |      }|rRt        |t              rt        |j                  t              sJ t        j                  |j                  |      |_        nV| j                  |       | j                  | j                  |             || j                  |<   nt        |j                        t         j"                  j$                  r| j&                  j                  t(        j+                  | ||j,                  |j.                  t0        j2                  j4                  j7                  |j8                        fd             |j                  t        j                  k7  r0| j&                  j                  t(        j                  | |             y y y )N)r,   )r  r  )rT  rY  	zero_moder3   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  rk   r   rC   r2   maximumr`   r   r   r   r   r  make_allocationr  r   r+   r=   rA   r*  r   )r   wsrx   r   priors        rE   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocation  s   {{}D"%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%6:JJ<   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rG   c                v    |j                   t        j                  k7  r| j                  t	        | |             y y r<   )r  r3   r  r   r  )r   r  s     rE   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation  s.    <<,;;;NN.tR89 <rG   c                $    | d| j                    S )Nz.zero_())r  )r   rx   s     rE   r  z%PythonWrapperCodegen.make_zero_buffer  s    x}--rG   c                H    | ddj                  |       d| j                   S )Nrr   rp   rs   )rw   r  )r   rx   	call_argss      rE   r  z%PythonWrapperCodegen.wrap_kernel_call  s'    q9-.a}==rG   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r+   r=   graph_idr  r   )r   r  s     rE   r  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rG   c                :    | j                   j                  d       y )Nzstart_graph())r  r   r   s    rE   r  z)PythonWrapperCodegen.generate_start_graph  s    ##O4rG   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(rs   )r  r   r   profile_bandwidth_outputr   s    rE   r  z'PythonWrapperCodegen.generate_end_graph  s'    ##j1P1P0SST$UVrG   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r"   r   r   s    rE   r  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags  s2      ''8'A'A&B C	
rG   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   s    rE   r  z5PythonWrapperCodegen.generate_save_uncompiled_kernels  s4     	  ''8'A'A&B 	C	
rG   c                >    d }|D cg c]
  } ||       c}S c c}w )Nc                    t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S t        t        j                  j                  j                  |             S )Nr  )r   r   r6   r   floatr   r   r  r+   r=   rA   rB   )ra   s    rE   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg  s^    #s#*B3*GsYPSPC#udO!DE3xQWW--66s;<<rG   r   )r   r  r  ra   s       rE   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_call  s!    	= *33#333s   c                    t        |t              rt        |t        j                        r:|j                  j                         }t        j                  j                  |      }nQt        j                  j                  |      "|}t        j                  j                  |      }n|J d       d| }|}t        d |j                         D              }t        d t        j                  j                  |      D              }t        d |j                         D              }	|j                         }
|j                         }t        j                  j                   j#                  |j%                         j&                  t(        j*                        }d| d|	 d	|
 d
| d| d| d} j,                  j/                  | d|        t        |t        j                        r5 j1                  |d      }|} j,                  j/                  | d|        |S t3        |t4        j6                        st        |t8              rt        |t:              r| j<                  v r|S |y|}t        |t8              r|j>                  }|t        j                  j                   j@                  v r't        j                  j                   j@                  |   }t;        t        j                  j                   jC                  |t(        j*                              S t        |t:        tD        tF        tH        f      rt;        |      S t        |tJ              rddjM                   fd|D               dS tO        dtQ        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr#  Nr+   r=   rA   rm  r   unbacked_symint_fallbackr   r  s     rE   r   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  s@      
 	   ;;#<< <    AAc              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  r  r  s     rE   r   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  s@      $
 	   ;;#<< < $r  c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  r  r  s     rE   r   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  s@      
 	   ;;#<< < r  r#  zgenerate_example_value(rp   z, 'z', rs   r   T)r  r  r  r  c              3  T   K   | ]  }j                  |t        |             ! y wr<   r   )r   ar   s     rE   r   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>C  s#      ZQR!@!@DG!L Zr   r  zUnsupported type ))r   torch_dtyper   rj  r|  rT  r+   r=   
get_buffertry_get_bufferr   r  get_allocation_sizer  r  r@   rA   r*  
get_layoutr  r   r  r   r   r  
issubclassr   Basicr   r   r  r   r  rm  r   r  r   r  rw   r  r   )r   ra   arg_typeraw_argr  r  r  r  allocation_sizer  r  r   r  r   s   `             rE   r   z/PythonWrapperCodegen.generate_example_arg_value  s   h,'2#3#34">>224gg((2'',8gg((-* X* &eW- 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#34 :: %) ;  **44zUG5LMO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFrG   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )Nr  rp   c              3  @   K   | ]  }j                  |        y wr<   )_grid_dim_str)r   r   r   s     rE   r   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>J  s     RT 2 24 8Rs   r  )r   r  rw   r  )r   grid_per_dims   ` rE   r  z"PythonWrapperCodegen._grid_dim_strG  s<    lD)diiR\RRRUXX &&rG   )r  r   	arg_typesr  r  c          
        |xs t         j                  j                         }|s1|j                  dk7  s"| j	                  | j                  ||             y| j                  |      }dj                  |      }t        j                  | |j                  t         j                        }	|s$d|	 d}
| j	                  | d| d| d|
 d       y| j                          t        j                  j                  r|| j                  vr|t!        |      t!        |      k(  sJ d       i }g }|dgt!        |      z  }nt!        |      t!        |      k(  sJ d	       t#        t%        |||            D ]  \  }\  }}}d}t'        |t(              r!d
t)        |      v r|j+                  d
      \  }}t'        |t,              rBt/        j0                  d|      r|}|||<   n8||vr| j3                  ||||      }|||<   n||   }n| j3                  ||||      }|j5                  ||n| d
|         | j6                  j	                  | ddj                  |       d|	 d       | j6                  j	                  ddj                  d |j9                         D               d       | j                  j;                  |       t         j                  j<                  ryt         j                  j>                  j@                  }|jC                  |||d       |5  | j	                  | d| d|	 d       ddd       y# 1 sw Y   yxY w)z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        r\  Nrp   z	c_void_p(rs   r   rr   z$call_args and arg_types do not matchz#call_args and raw_args do not matchrO  z^(workspace|semaphore)z.run(z	, stream=del c              3      K   | ]  }|  y wr<   r   )r   ra   s     rE   r   z<PythonWrapperCodegen.generate_kernel_call.<locals>.<genexpr>  s      E E   r9  )"r+   r=   rl  r   r   r  r  rw   r  r  r  r  r   r   r   r   r_   r  r   r   r   splitr  r]   matchr   r  r   r  r   r<  rd  r  re  )r   ra  r  r  r   r  r  r  call_args_strstream_name
stream_ptrtensor_argsall_argsr  ra   r  r  r   arg_strrh  s                       rE   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_callO  sE   " @177>>@&++.NN400iHI77	B		-0*??&,,
 $[M3JNN-qQ}oR
|1M %%' MM224#=#== (S^s9~-M 6M KH 6C	N28}I6 96 09Iy(30 P++C7 c3'C3s8O"yy~HCh4 xx 93?"%+2C(K/"&"A"A7A# ,3C("-c"2"==c8WVWXG3;se1WI<NO/P2 &&00-uTYYx%8$9;-qQ &&00tyy E0B0B0D EEFbI &&**;7ww"" !" 4 4 B B..y+yRVW" 	XNNk]%i}TUVW	X 	X 	Xs   L::Mc                :    | j                   j                  |       y r<   )r  r  )r   r   s     rE   r   zPythonWrapperCodegen.writeline  s    

$rG   c                4    |D ]  }| j                  |        y r<   r  )r   r  r   s      rE   r%  zPythonWrapperCodegen.writelines  s     	!DNN4 	!rG   c                L    | j                   j                  t        |             y r<   )r  r  r&   )r   ctxs     rE   r  z"PythonWrapperCodegen.enter_context  s    

+c*+rG   c                &    ddl m}m}  |       rdd l}t	        |t
              rt        |j                  j                        S t	        |t        j                        rt        |      S t	        |t        t        f      rAt        j                   G d d             t         t!        |       fd|D                    S t	        |t"        j$                  j&                        rt)        |      S t	        |t*        j,                  t*        j.                  t0        f      r|j3                         S  |       r(t	        |j4                  j6                        r ||      S t	        |t*        j8                        r|j3                         S t        |      S )Nr   )dtype_to_stringhas_triton_packagec                      e Zd ZU ded<   d Zy)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S r<   )r  r   s    rE   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__  s    88OrG   N)r   r   r   r   r  r   rG   rE   Shimr    s    $rG   r  c              3  V   K   | ]   } t         j                  |             " y wr<   rs  )r   r  r  r   s     rE   r   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>  s$     Vq1@@qIJVs   &))re  r  r  r   r   r   r  rC   r  r   r
   r   r  rP  	dataclassrX   r   r  _ops
OpOverloadr   r   rS  
MutableBoxr!   r  languager   r  )r   r  type_r  r  r   r  s   `     @rE   rt  z#PythonWrapperCodegen.val_to_arg_str  s'   Ka"%%5::&8OE4=)""$ $ #$ QVTUVV  5::001&q))BIIr}}oFG&&((!jFOO4I4I&J"1%%2,,-&&((7NrG   c                >   |j                         }|j                         }t        |j                               }t        t        j
                  j                  |            }t        |j                               }| j                  |j                         |||||      S r<   )
r  r@   r   r  r+   r=   r  r  r  rT  )r   rT  r  r   r  allocation_shaper  s          rE   ri  z+PythonWrapperCodegen.make_buffer_allocation  s    ""$  "foo'( !<!<V!DEv((*+##OOvueV=M
 	
rG   c           
         ||}| j                  |      }| j                  |      }| j                  |      }	|j                  dv r| d|j                   d| d|	 d| d
}
n| d| d|	 d|j                   d| d
}
||k7  r|
d	| d|	 dz   }
|
S )
N)r\  cudaxpuz = empty_strided_rr   rp   rs   z = empty_strided(r  r  z.as_strided()r   r   )r   rx   r  r   r  r  r  r  codegen_allocation_shape_tuplecodegen_stride_tuplerf  s              rE   r  z$PythonWrapperCodegen.make_allocation  s     #$"==eD)-)H)H*
&  $>>vF;;00 &)&++a12"'('  &)12"'( )!;;-yq:  "@@,':&;2>R=SSTUUC
rG   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr     rq   )r  r  r  )r   new_nameold_namer  s       rE   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias	  s6    ,,zXJt{{m2dll^STU\T]^^rG   c                (    d|j                          S )Nr  )rT  )r   rT  s     rE   rq  z%PythonWrapperCodegen.make_buffer_free
	  s    foo'())rG   c                8    ddj                  d |D               S )Nr  rp   c              3      K   | ]  }|  y wr<   r   )r   rx   s     rE   r   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>	  s     >>r  )rw   )r   names_to_dels     rE   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names	  s    dii>>>?@@rG   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr   r   reuse)r  r  r  )r   r  r  del_lines       rE   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse	  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrG   c                r   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r4|j                         |j                         k(  r| j                  |||      S | j                  ||j                         |j                         d| j                  j                        }| j                   | d| | d| j                   dS )N;z; r   r   r  r  )r@   rT  r+   r=   r  rq  r  r  r  r  r  r   r  r  )r   oldnewrt  r  r  r  reinterpret_views           rE   rw  z&PythonWrapperCodegen.make_buffer_reuse	  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228XxPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rG   c                    | j                  t        || j                   | d|j                          | j                   d| j
                   d             y )Nr   r  z alias)r   r/   r  r  r  r  )r   rx   views      rE   codegen_deferred_allocationz0PythonWrapperCodegen.codegen_deferred_allocation#	  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rG   c                j   |j                         }|t        j                  j                  v s(|| j                  v st        |t        j                        ry | j                  j                  |       t        |j                         t        j                  t        j                  f      r|j                         sy |j                         }t        |t        j                        ry t        |t        j                        ry t        |t        j                         rHt        |j"                  t        j$                        s*J dt'        |j"                         d|j"                          t        |j"                  j(                  t        j*                        s$J t'        |j"                  j(                               t        |j"                  j(                  j(                  t        j,                        s$J t'        |j"                  j(                               | j/                  |j"                  j(                  j(                         | j1                  ||j"                         y t        |t        j2                        r| j5                  t7        | |             y | j5                  t9        | |             y )Nzunexpected r   )rT  r+   r=   r]  r  r   r   DonatedBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVErX  NonOwningLayoutr  r!   r   r  
StorageBoxrS  codegen_allocationr  r  r   r  rY  )r   rT  rx   rn  s       rE   r  z'PythonWrapperCodegen.codegen_allocation+	  s     AGG+++t~~%&""2"234 &&(%%r~~6 **,'')fb;;<fbmm,fb001fkk2+=+=> d6;;/06;;-@> fkk..>VV[[EUEU@VV>fkk..33RYY?WfkkFVFVAWW?##FKK$4$4$9$9:,,T6;;?fb112NN1$?@|D&12rG   c                   |j                         }t        |t        j                  t        j                  f      r!| j                  | j                  |             y t        |j                         t        j                        r| j                  t        | |             y | j                  |      sy | j                  j                  |       | j                  t        | |             y r<   )rT  r   r   InputBufferr  r   rq  r  r  r  	can_reuser  r   r  )r   rT  rx   s      rE   codegen_freez!PythonWrapperCodegen.codegen_freeS	  s      fr~~r/A/ABCNN40089f,,.0C0CD NN-dF;<~~f%

t*489rG   c                   |j                         }|t        j                  j                  v xs |t        j                  j                  v xr6 t        t        j                  j                  |   t        j                         xsh |t        j                  j                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S r<   )rT  r+   r=   r]  r
  r   graph_inputs_originalr   r  r&  r'  never_reuse_buffersr  )r   input_bufferoutput_bufferrx   s       rE   r  zPythonWrapperCodegen.can_reuseg	  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" tzz!
 	
rG   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S r<   )rT  r  )r   rT  reused_buffers      rE   	did_reusezPythonWrapperCodegen.did_reusew	  sC     OO, KFOO-.-2H2H2JJ	
rG   c                t   t        ||      sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y r<   )	rO   r  r  r   rT  r  r  r   r`  )r   r  r  s      rE   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse	  s    $\=AAA-

|,,./=11340<0E0E0GM**,-y|]CDrG   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S r<   )r   r  r   r  )r   r   rx   s      rE   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_decl	  sC    6{4---K &&**40<<$&&rG   c                &   t        t        j                  j                  j                  |      }|sy |j                         D ]I  \  }dfdfd}| j                  | j                  |       d |        | j                          K y )Nc                   |dk(  r| S t        |      dk\  r_t        |d   t              rLt        |d   t        j                        r/ |  d|d   j
                   d|d   j                   d|dd        S t        |d   t              r |  d|d   j
                   d|dd        S t        |d   t        j                        rYt        j                  j                  r  d	|d   j                   d
|  d|dd        S  |  d|d   j                   d|dd        S t        |d   t              r |  d|d   j                   d|dd        S t        d|       )Nr   r   r   r,   r   rr   rs   r  z	std::get<z>(r  r  z.__floordiv__(r  )r_   r   r   pytreeSequenceKeyrx   rJ  r+   r=   r<  r   r  r`   )r  r  gos     rE   r!  zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go	  s   b=K LA%"71:}="71:v/A/AB&'!*//!2!GAJNN3C1Ewqr{   
M:a
'8;WQR[II
F,>,>? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K8 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrG   c                    t         j                  j                  rt              dk(  rZd   }  d   j	                         t        | t        j                        r!t        | j                        dk7  r	dd        S       S t        d   t        j                        sJ  d   j                     j	                         dd        S        S )Nr,   r   )r+   r=   r<  r_   rT  r   r   r  r  r  r   rJ  )rf  r!  r  r_  r  s    rE   go_outerzOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer	  s    77&&
 7|q(%aj  "#AJ//1)#r~~>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIII!''!*.."9"B"B"DgabkRRk733rG   r   )r  r   r  zpytree.KeyPath)	r   r+   r=   rA   	shape_envr\   r   r  r  )r   r_  r  unbacked_bindingsr  r#  r!  r  s    ``   @@rE   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs	  s     6GG&&(9
 ! ,113 <	JAw
L<4. NN44Q78HJ<}Uu<	rG   c                     fd} fd}	  j                  j                          j                   j                   dj                           |        t
        j                  }t        j                  j                        5  j                  j                  |       d d d         |         j                          y # 1 sw Y   !xY w#  j                          w xY w)Nc                    t        j                  j                        t              k(  sJ t        j                  j                        D ]3  \  } }j	                  j
                   |  d| j                          5 y r  )r_   r=   r
  r   r   r  r  )inner_inputouter_inputouter_inputsr   subgraphs     rE   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix	  sy    x~~223s<7HHHH,/++\- ([ ||n[M[M$++OrG   c                    t        j                  j                        t              k(  sJ t        j                  j                        D ]5  \  } }j	                  | d| j                          j                          7 y r  )r_   r=   r  r   r   r  r  )inner_outputouter_outputouter_outputsr   r,  s     rE   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix	  s{    x~~334M8JJJJ.1,,m/ *l #nC(F(F(H'I$++WrG    subgraph: )parent_graph)	r%  r=   r   r  rx   r+   set_graph_handlercodegen_subgraphr2  )r   r,  r+  r1  r-  r2  r4  s   ````   rE   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inlining	  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4 //!- 0  %&$$&  $$&s$   A;C C,C CC C*c                H   t        |      t        |j                  j                        k(  s!J d|j                  j                   d|        t        |j                  j                  |      D ]3  \  }}| j	                  | j
                   | d| | j                          5 y )Nzgraph_input_names:z, outer_inputs: r   )r_   r=   r  r   r   r  r  )r   r,  r+  r1  r)  r*  s         rE   codegen_subgraph_prefixz,PythonWrapperCodegen.codegen_subgraph_prefix
  s     < C(H(H$II 	
 !A!A BBRS_R`a	
I ),NN,,l)
 	X$K NNdll^K=K=VW	XrG   c           	     T   |j                   }|j                  }dj                  |j                               t	        |      dk(  rdndz   }|D cg c]  }|j                          }}dj                  |      t	        |      dk(  rdndz   }| j                  d| d| d       |j                         D 	
cg c]
  \  }	}
|
s	|	 }}	}
|r#| j                  ddj                  |              | j                  d	| d
| d| d       | j                  d| d       yc c}w c c}
}	w )z'Generate code to call a graph partitionrp   r,   r  r  	partition	_args = [r  r  rr   z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesrw   r+  r_   rT  r   r\   )r   partition_idr  r?  r@  r  rC   output_namesr  rx   
deallocater  s               rE   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call
  s<    2DD+88-2245)*a/CR
 5AADAA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
%T:zD
 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;! B
s   D7
D$D$c                P    t        |      D cg c]  }d| 	 c}| _        y c c}w )N
partition_)r  rK  )r   num_partitionsrJ  s      rE   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_names7
  s$    BGBW#X3j$6#X #Xs   #c           	        |j                   j                  }dj                  |      }t        |      dk(  r|dz  }dj                  |      t        |      dk(  rdndz   }| j	                  |j                   j
                   d| d       |d t        |       D ]  }| j	                  d|         | j	                  d| d	|j                   j
                   d|j                   j
                   d
       y )Nrp   r,   r  r  r<  r  r  rr   z) = r=  )r=   r  rw   r_   r   rx   )r   r,  r+  r1  r  inner_inputsouter_output_namesr)  s           rE   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call:
  s    nn66yy-{q CL!YY}5}%*C

 	(..--.i~QGH&':\):; 	1KNNT+/0	1 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rG   c                ~   t         j                  j                  r| j                  |||       y | j	                  |j                         | j                  d       | j                  | j                   d|j                          | j                  |||       t         j                  }|j                  |j                  _	        |j                  j                  | j                  vrt        j                  |j                        5  t        j                  dd      5  |j                  j                         \  }}d d d        d d d        | j                  j                  |j                  j                         | j!                  j"                         | j%                  |||       y # 1 sw Y   oxY w# 1 sw Y   sxY w)Nr  r3  r  F)r+   r=   r=  r7  r%  r   r  rx   r9  r<  r  r5  r   patchr)  r   r<  r   rL  )r   r,  r+  r1  r4  subgraph_coderd  s          rE   r6  z%PythonWrapperCodegen.codegen_subgraphO
  sZ    77--hmT!!(..1r$,,{8==/BC$$X|]Kww%1%=%=">>d&F&FF $$X^^4 @\\"3U; @'/~~'='='?$M1@@
 ,,001D1DE,,]-@-@A""8\=I@ @@ @s$   F3F'9F3'F0	,F33F<c                t   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        t        |j                              D cg c]
  }| d| d }}| j                  |j                  ||       y c c}w c c}w )N = [None] * r  r  )	rT  r   r_   r  r  r  r  r6  r,  )r   invoke_subgraphrx   r  r+  r  r1  s          rE   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraphk
  s    '')$|C0G0G,H+IJK;J;Q;QRC--/RR16s?;R;R7S1TUAD61#QUUo66mT SUs   B0B5c                   |j                         }|j                  D cg c]  }|j                          }}t        t	        |j
                              D cg c]
  }| d| d }}|j                  j                         }t        |j                  t        j                        s| d}| j                  | dt	        |j
                                | j                  d| d       | j                  t        | |j                  j                               | j                  |j                  ||       | j                  t        |              | j                  d       | j                  t        | |j                   j                               | j                  |j                   ||       | j                  t        |              y c c}w c c}w )Nr  r  r  rQ  r   r;  zelse:)rT  operandsr  r  r_   r  	predicater   r   ShapeAsConstantBufferr   r  true_subgraphr=   r6  r.  false_subgraph)r   conditionalrx   r  r+  r  r1  rV  s           rE   codegen_conditionalz(PythonWrapperCodegen.codegen_conditionals
  sx   ##%;F;O;OPC--/PP16s;;N;N7O1PQAD61#QQQ));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOPk77}U'-.w({/I/I/O/OPQk88,V'-.! QQs   GGc                   |j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}| j	                  | dt        |              t        |      D ]  \  }}| j	                  | d| d|          g t        t        |            D cg c]
  }| d| d c}|}| dg}	t        |      }
|
d t        |       }| j	                  d       | j	                  t        | |j                  j                               | j                  |j                  ||	       | j	                  d|	d    d	       | j	                  t        |              | j	                  t        | |j                  j                               | j                  |j                  |
|       | j	                  t        |              y c c}w c c}w c c}w )
NrQ  r  z] = r  _cond_resultzwhile True:zif not r   z: break)rT  carried_inputsr  additional_inputsr   r_   r  r  r  r  cond_subgraphr=   r6  r.  body_subgraph)r   
while_looprx   r  outer_carried_inputsouter_additional_inputsr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputss               rE   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loop
  s   ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 	$|C0D,E+FGH 45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK}%(z/G/G/M/MNO$$&79K	
 	(+,G4	
 	'-.(z/G/G/M/MNO$$&79K	
 	'-.M 
#
 Hs   G+G0G5c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }||S t        |      S # t        $ r Y y w xY w)Nr[  )rR  r   r   r+   r=   
_shape_env_maybe_evaluate_staticr  )r   r   s     rE   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none
  sf    	q.$/ !S!''$$;;A>C{
s8O 		s!   A A ,A 
A 	A&%A&c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S r<   )r  rn  r  )lstrH  r   nums       rE   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none
  sA     	A&CCAFC{MM#		
 rG   c                0    t         j                  |       d uS r<   )r  rr  )rp  s    rE    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints
  s     !FFsKSWW	
rG   c                H    t         j                  | j                               S r<   )r  rr  r  rT  s    rE   ra  z4PythonWrapperCodegen.static_shape_for_buffer_or_none
  s    #IIOO
 	
rG   c                0    t         j                  |       d uS r<   )r  ra  rv  s    rE   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shape
  s    #CCFKSWWWrG   r<   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]r*  )rx   r   r  r   r   r  )r  r   )r  TritonMetaParamsr   r   r   	list[str]r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  r}  r  )r6  r   r   r   rW  )r6  r   r   r  )rD  r}  r   r  rH  r0   r   r  )ry   r   rf  r   rg  r   rz   r}  r  r   r   r  )F)NNN)
r  r   r  r   r  r   r  r}  r  zOptional[torch._ops.OpOverload])rx   r   r   zir.TensorBoxr  zOrderedSet[sympy.Symbol])r  zsympy.Symbol)r   r
   rB   r   r   r   )r   r
   r   r   )r  r   rx   r   r  r   r   r   )r  zSequence[Expr]r   r   )r   zCallable[..., None]r   r   )r	  r   )NTN)
ra  r   r3  r   r4  r   r5  r   r6  r   )r;  r   )r   z"list[list[Union[int, sympy.Expr]]])ra  r   r  r   )r  r2   )NN)ra  r   )rT  rZ  )r  )rT  z%Union[BufferLike, ir.TorchBindObject])r  r}  )r  r   r  r   r  r   )r  rZ  r  rZ  rt  r   )rx   r   r  zir.ReinterpretViewr   r  rT  r{  )r  r{  r  r{  )r_  r   r  r   r%  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   r  )rA  r   r  zir.GraphPartitionSignature)rG  r   ){r   r   r   __doc__r  r  r  r  r  r  r  r  r$   r  r  r   r  r  r  r  r  r  r  r  r!  r#  r  r*  r  r4  r%  r2  r!  r0  r=  r?  rA  rE  rI  rM  rO  rS  rQ  ri  r  r  r  r  r  r  r  r  r  r  r  r  r)  r  r  r  r  r  r  r   r  r  r  r
  r  r  r.  r  r9  r<  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r   r%  r  rt  ri  r  r  rq  r  r  rw  r  r  r  r  r  r  r  r&  r7  r9  rD  rH  rL  r6  rS  r[  rj  rn  rr  rt  ra  rx  r  r  s   @rE   r  r    s   P#d 
 FJ	&&$& 7& C	& &'<;Az
   "  	! 
 

$	G$
%S$	(
.8)6(-.10J
/+7&5
5A:;; ;  	;
 ; ; 
;,
&< 8<
X
X  
X 	
X
  
X 5
X0M
^!SF:

B(K(K (K -	(KTJ"5 @D W CG +.&'6
(  ' 
:>G4*[Yz, #'(,!! !  	!
 ! &!&2C; 2C;J1%N:.>85W

4
4QGf' _X_XB !, F
 DH@_*Au` 
&3P:(
 
E'NN N H	N
 
N`+'Z	X<< 9<>Y
*J8U/*(/T     
 

 
 

 X XrG   r  c                       e Zd ZdZ	 d	 	 	 	 	 d fdZddZddZd Zd Zd Z	ddZ
dd	Zdd
ZddZ	 	 ddZddZddZd fdZedd       Zedd       Z xZS )r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    c                L    || _         || _        || _        t        |           y r<   )r  r  r  r  r  )r   r  r  r  r
  s       rE   r  z%SubgraphPythonWrapperCodegen.__init__
  s(     +,$8!rG   c                &    | j                   | _        y r<   )r  r  r   s    rE   r  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name
  s     !% 2 2rG   c                     y r<   r   r   s    rE   r  z)SubgraphPythonWrapperCodegen.write_header
  r  rG   c                     y r<   r   r1  s     rE   r  z2SubgraphPythonWrapperCodegen.add_benchmark_harness
  r  rG   c                     y r<   r   r1  s     rE   r.  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module
  r  rG   c                     y r<   r   r   s    rE   r  z5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  rG   c                6    | j                   j                         S r<   )r  r=  r   s    rE   r=  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rG   c                     y r<   r   rG  s     rE   rM  z2SubgraphPythonWrapperCodegen.generate_after_suffix  r  rG   c                \    | j                   j                  d| j                   d       d}|S )Nz
            def z(args):
            r,   )r  r   r  r  s     rE   r!  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rG   c                     yr   r   r   s    rE   r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rG   c                r    | j                   x}r|j                  }|S t        j                  j                  }|S r<   )r  input_nodesr+   r=   r
  )r   rL  r  s      rE   r  z-SubgraphPythonWrapperCodegen.get_graph_inputs  s=     11191**F  WW))FrG   c                    | j                   x}r%t        |j                  j                               }|S t        j
                  j                  }|S r<   )r  r  r  r+  r+   r=   r  )r   rL  namess      rE   r#  z2SubgraphPythonWrapperCodegen.get_graph_input_names   sI    11191..3356E  GG--ErG   c                r    | j                   x}r|j                  }|S t        j                  j                  }|S r<   )r  r@  r+   r=   r  )r   rL  r  s      rE   r  z.SubgraphPythonWrapperCodegen.get_graph_outputs'  s;    11191,,G  gg++GrG   c                ~    |j                         }| j                  x}r||j                  v ry t        |   |       y r<   )rT  r  r  r  r  )r   rT  rx   rL  r
  s       rE   r  z/SubgraphPythonWrapperCodegen.codegen_allocation.  s?     222I2	@U@U8U "6*rG   c                8    | j                   j                          y r<   )r  r  r   s    rE   r  z5SubgraphPythonWrapperCodegen.write_triton_header_once8  s     	446rG   c                8    | j                   j                          y r<   )r  r  r   s    rE   r  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_onceA  s     	<<>rG   r<   )r  r   r  r  r  rz  r*  rW  r  r  r~  r|  r  r  )r   r   r   r  r  r  r  r  r.  r  r=  rM  r!  r  r  r#  r  r  r$   r  r  r  r  s   @rE   r  r  
  s     FJ	 - C	3
8	G+ 7 7 ? ?rG   r  )rC   rZ  r   r  )rK   rZ  rL   rZ  )ra   torch.Argumentr   r   )rl   r  r   r   )ry   ztorch._ops.OpOverloadr   r   r<   )
rx   r   r   zlist[triton.Config]r   zlist[TritonGrid]r   ry  r   ztuple[str, str]rW  )
__future__r   r  r   rP  r   rb  rw  r  rd  r  r]   r  	itertoolsr   typingr   r   r   r   r	   r   r
   r  
torch._opstorch.utils._pytreeutils_pytreer  r   r  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r   r   r   	codecacher   r    r!   runtimer"   runtime.hintsr#   r$   r%   r&   r'   r(   r)   r*   virtualizedr+   rg  r-   r.   r/   r0   r1   r2   r3   	cpp_utilsr4   triton_utilsr5   r6   r7   collections.abcr8   r9   r   r=   r:   doprintr  r   r  r   r  rS  rZ  rF   rO   ri   rn   r   rm  r   r{  r   r   r   r  r   r   r  r  r.  r5  rD  rH  rY  r  r`  r^  rz  r  r  r  r  r  r   rG   rE   <module>r     sr   "    
      	   @ @     $ $ & 6 C A ;  . / 9 : ( ( ' ( ' ,       P P 2% 	 u{{C/0299l*+
	>B>$<$ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3	U&
U& U& U& ,	U&
 U&pJ&Z   * **	 	 	 	 	 {   KK K KD;  ; ; ;2 %  : E, E E, 
" 
 
(	! 	 ![ ! !: &^ & &R N N N 
N!X7 N!XbBl?#7 l?rG   