
    Vhr                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3 i ejh                  dejj                  dejl                  dejn                  dejp                  dejr                  dejt                  dejv                  dejx                  dejz                  dej|                  dej~                  dej                  d ej                  d!ej                  d"ej                  d#ej                  d$ej                  d%ej                  d&ej                  d'iZHi ejh                  d(ejj                  d)ejl                  d*ejn                  d+ejp                  d,ejr                  d-ejt                  d.ejv                  d/ejx                  d0ejz                  d1ej|                  d2ejx                  d0ejv                  d/ej~                  d3ej                  d4ej                  d5ej                  d6ej                  d7ej                  d8ej                  d9ej                  d:ej                  d;iZId<d=d>d?d@ZJej                  dAej                  dBiZMej                  dCk(  ZOdZP edDg dE      ZQdF ZRdG ZSdHej                  j                  dIe3fdJZVdIe3fdKZWdL ZX G dM dNe/      ZY G dO dPe      Z e       j                  Z[dQ Z\dR Z]dSdTdUej                  dVe_fdWZ`dSdTdUej                  dVe_fdXZa G dY dTe-j                        Zc G dZ d[      Zdej                  fd\e'd]efe/d^f   fd_Zgd` Zhejh                  fdaZidb Zjdc Zkdd Zlde Zmdfe$dgene$   dIefe?e?f   fdhZoy)i    N)
namedtuple)Sequence)AnyCallableOptional)patch)is_integer_dtype)
OrderedSet)
CppPrinter)symbol_is_typeSymT)ValueRanges   )ir)Dep)LoopBody)BaseSchedulerNodeSchedulerBuffer)IndentedBuffersympy_index_symbol_with_prefix
sympy_subs)opsOpsValueV   )CSEVariablededuce_output_dtype_by_nameKernel
KernelArgsOptimizationContextfloatdoublehalfint64_tint32_tint16_tint8_tuint64_tuint32_tuint16_tuint8_tboolbfloat16zc10::complex<half>zc10::complex<float>zc10::complex<double>float8_e4m3fnfloat8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzz
at::kFloatzat::kDoublez	at::kHalfz	at::kLongzat::kIntz
at::kShortz	at::kCharzat::kUInt64zat::kUInt32zat::kUInt16z	at::kBytez	at::kBoolzat::kBFloat16zat::kComplexHalfzat::kComplexFloatzat::kComplexDoublezat::kFloat8_e4m3fnzat::kFloat8_e5m2zat::kFloat8_e4m3fnuzzat::kFloat8_e5m2fnuzz	at::kMetazat::kCPUz	at::kCUDAzat::kXPU)metacpucudaxpuzat::kStridedzat::kMkldnnwin32GemmBlocking)block_mblock_nblock_kc           
          t        d | D              rMt        j                  t        j                  | D cg c]  }t        |t              s|j                  ! c}      S d S c c}w )Nc              3   Z   K   | ]#  }t        |t              s|j                  d u % y wN
isinstanceCppCSEVariabledtype).0ns     Q/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_utils.py	<genexpr>z$get_promote_dtype.<locals>.<genexpr>m   s"     RqJq.4Qqwwd"R   ++)all	functoolsreducetorchpromote_typesr?   r@   rA   )argsrC   s     rD   get_promote_dtyperM   g   s[     RDRR	 	"DjN&CQWWD	
  Es   A"
A"
c                     d }t        |       }t        j                  ||      }t        d | D              r|rt	        t        ||             } | S )Nc                     t        | t              rX| j                  rL|rJ| j                  |k7  r;t        j                  | |      } t        | t
              r| j                  n| } || _        | S r=   )r?   r@   rA   r   to_dtyper   value)argpromote_types     rD   promote_argz!promote_args.<locals>.promote_args   sR    sN+				\),,sL1C)#x8#))cC$CI
    )rS   c              3   X   K   | ]"  }t        |t              r|j                  d u $ y wr=   r>   )rB   new_args     rD   rE   zpromote_args.<locals>.<genexpr>   s+      
'>2 MM%
s   (*)rM   rH   partialrG   listmap)new_argsrT   rS   
promote_fns       rD   promote_argsr]   r   s[    
 %X.L""!J
 	 
#
 	

 J12OrU   nodereturnc                 V    | j                   j                  t        j                  d       S r=   )r2   getr    key)r^   s    rD   get_opt_ctxrc      s    99==,00$77rU   c                      t         j                  j                  sJ t        t         j                  j                        S r=   )r   interpretercurrent_noderc    rU   rD   get_current_node_opt_ctxrh      s*    ==%%%%q}}1122rU   c           
         t        | g|i |x}	 |S | dk(  rmt        t        j                  d      r=t        j                  j                  j
                  j                  d      r
t               J t               j                  S t        d |D              sJ t        j                  t        j                  |D cg c]  }t        |t              s|j                  ! c}      S c c}w )Nmaskedrf   masked_subblockc              3   Z   K   | ]#  }t        |t              s|j                  d u % y wr=   r>   rB   rR   s     rD   rE   z4deduce_dtype_for_cpp_cse_variable.<locals>.<genexpr>   s'      
&)Z^5TCIIT!
rF   )r   hasattrr   re   rf   target
startswithrh   rA   rG   rH   rI   rJ   rK   r?   r@   )namerL   kwargsoutput_dtyperR   s        rD   !deduce_dtype_for_cpp_cse_variablert      s    3

 
 	

  		
 AMM>2**11<<=NO(*6	
7 ()///  
-1
 
 	
 
 "&J3*S.*ISYYJ
 	
Js   =C(
C(
c                        e Zd Z	 ddee   deej                     ddf fdZde	fdZ
d Zdej                  fd	Zd
ej                  fdZ xZS )r@   NboundsrA   r_   c                 v    t         |   |||       d| _        t        t        j
                            | _        y )NF)super__init__is_vecr
   sympySymboldependent_itervars)selfrq   rv   rA   	__class__s       rD   ry   zCppCSEVariable.__init__   s1     	vu-",U\\":"<rU   c                     d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzCppCSEVariable(name: z
, bounds: z
, is_vec: z	, dtype: z, dependent_itervars: ))rq   rv   rz   rA   r}   )r~   s    rD   __repr__zCppCSEVariable.__repr__   sQ    #DII;jZPTP[P[}\efjfpfpeq r##'#:#:";1>	
rU   c           	         |dk(  r| j                  |d          ns | j                  j                  |D cg c]  }t        |t              r|j                    c}  |dk(  r| j                  |d          t        d |D              rd| _        | j                  t        |g|i || _        | j                  J y c c}w )Nloadr   
index_exprr   c              3   V   K   | ]!  }t        |t              s|j                   # y wr=   )r?   r@   rz   rm   s     rD   rE   z0CppCSEVariable.update_on_args.<locals>.<genexpr>   s     Q#C1P3::Qs   ))T)	_set_dependent_itervarsr}   updater?   r@   anyrz   rA   rt   )r~   rq   rL   rr   rR   s        rD   update_on_argszCppCSEVariable.update_on_args   s    6>((a1 +D##**  $!#~6 ** |#,,T!W5QQQ":: ;4Q$Q&QDJzz%%%!s   #B=indexc                    |j                   D ]  }|t        j                  j                  v r| j                  j                  |       ;|j                  t        j                  j                  j                  v sl| j                  j                  t        j                  j                  j                  |j                     j                          y)z
        Set the relevant itervars for this variable based on the `index` expression.
        This includes the itervars directly used in the `index` as well as relevant itervars
        of other cse variables used in the `index`.
        N)
free_symbolsr   kernelitervarsr}   addrq   csevarname_mapr   )r~   r   ss      rD   r   z&CppCSEVariable._set_dependent_itervars   s     ## 	AAHH%%%''++A.188<<333''..HHLL,,QVV4GG		rU   itervarc                     || j                   v S r=   )r}   )r~   r   s     rD   
depends_onzCppCSEVariable.depends_on   s    $1111rU   r=   )__name__
__module____qualname__r   r   r   rJ   rA   ry   strr   r   r{   Exprr   r|   r   __classcell__r   s   @rD   r@   r@      sj    
 (,	= C = $	=
 
=
# 
&2UZZ 2%,, 2rU   r@   c                   ,     e Zd Zddddef fdZ xZS )r   T)simplifypr   c                    |r]t        |t        j                        rCt        t        j
                  d      r)t        j
                  j                  j                  |      }t        | %  |      S )Nsizevars)
r?   r{   r   rn   r   graphr   r   rx   doprint)r~   exprr   r   r   s       rD   r   zCppPrinter.doprint   sI    
44*9U77##,,T2Dwt$$rU   )r   r   r   r,   r   r   r   s   @rD   r   r      s    04 % % %rU   r   c                 .    dt          dt        |        dS )Nstatic_cast<>(r   )
INDEX_TYPEcexprr   s    rD   cexpr_indexr      s    *Re~Q77rU   c                    | t        d      k(  rd| dS | t        d      k(  rd| dS t        | t              r d| dt        |       j	                          dS t        j                  |       rd| d	S d| dt        |        dS )
Nz-infz-std::numeric_limits<z>::infinity()infzstd::numeric_limits<r   r   r   z>::quiet_NaN())r!   r?   r,   r   lowermathisnanrepr)rQ   cpp_types     rD   value_to_cppr     s    f&xj>>	%,	%hZ}==	E4	 hZr#e**:*:*<)=Q??	E	%hZ~>>hZr$u+a88rU   localize_buffer_handlerLocalizeBufferHandlerr   global_buf_namec                    t         j                  j                  j                  |   j                  }|J | j
                  |   }|j                         }t        |d       j                  \  }\  }}t        |      t        |      z   }	t        t        |j                         j                              D 
cg c]  }
dt        |	      |
dz   z
    }}
t        |j                  d       }i }|D ]X  }|j                   j#                  d      s|j                   |vs.t$        j&                  j(                  j+                         ||<   Z t-        ||      }|S c c}
w )Nc                 4    t        | j                               S r=   )intis_reduction)xs    rD   <lambda>z,rewrite_index_for_function.<locals>.<lambda>  s    s1>>+;'< rU   )rb   r   r   c                     | j                   S r=   )rq   )r   s    rD   r   z,rewrite_index_for_function.<locals>.<lambda>"  s
    aff rU   )r   r   	schedulername_to_bufdefining_opglobal_to_local	get_nodesmaxgrouptuplerangelen
get_layoutsizesortedr   rq   rp   r{   corenumbersZeror   )r   r   r   snode	local_bufscheduler_nodes_r   reduction_groupcall_rangesidxindices_to_keepsorted_symbolsreplacementsr   s                  rD   rewrite_index_for_functionr     sJ    GG))/:FFE'77HIoo'O"%<#e  A ,!77K Y113889: Ca()*O  E..4DENL 866S!affO&C#jj00557LO8 ul+ELs   8Ec                 X   t        d |j                  D              }g }| j                  |   }t        t	        |j                                     D ]3  }t        t        j                  |      }|j                  ||v r|nd       5  |j                         j                         |      }|S )Nc              3   V   K   | ]!  }t        |t        j                        s| # y wr=   )r   r   INDEX)rB   r   s     rD   rE   z*rewrite_index_for_nodes.<locals>.<genexpr>1  s"      4::)Fs   ))r   )r
   r   r   r   r   get_sizer   r   r   appendr   make_indexer)r   r   r   	used_vars
index_varsr   ivars           rD   rewrite_index_for_nodesr   ,  s    
  %% I J'77HI3y))+,- :,TZZ;	!1#q9: 2I  "//1*=ELrU   c                        e Zd Zdeeej                  f   ded ej                  egej                  f   ddf fdZ
dedej                  fdZdedej                  fd	Zdd
Zd Z xZS )r   r   rewrite_indexr_   Nc                 @    t         |   |       || _        || _        y r=   )rx   ry   r   r   )r~   innerr   r   r   s       rD   ry   zLocalizeBufferHandler.__init__>  s"     	.*rU   rq   r   c                     | j                   rL|| j                   v r>| j                  J | j                  | ||      }| j                   |   j                         }||fS r=   )r   r   get_namer~   rq   r   s      rD   localizezLocalizeBufferHandler.localizeH  sa    DD,@,@$@%%111&&tUD9E''-668DU{rU   c                 T     | j                   j                  | j                  ||       S r=   )_innerr   r   r   s      rD   r   zLocalizeBufferHandler.loadO  s$    t{{tU!;<<rU   c                 .   | j                  ||      \  }}| j                  j                  ||||      }| j                  rU|| j                  v rGt	        t
        j                  t              r)t
        j                  j                  j                  |       |S r=   )
r   r   storer   r?   r   r   r   store_buffer_namesdiscard)r~   rq   r   rQ   modelocal_buffer_namelocal_buffer_indexress           rD   r   zLocalizeBufferHandler.storeR  sz    04dE0J--kk 13EudS  ,,,188V, HH''//0AB
rU   c                 ^     | j                   j                  g | j                  ||      | S r=   )r   store_reductionr   )r~   rq   r   rQ   s       rD   r   z%LocalizeBufferHandler.store_reduction_  s,    *t{{**NDMM$,FNNNrU   r=   )r   r   r   dictr   r   Bufferr   r{   r   ry   r   r   r   r   r   r   s   @rD   r   r   =  s    + c299n-+  !8%**c JEJJ VW	+
 
+S  = =UZZ =OrU   c                   N   e Zd ZdZdeddfdZd Zd Z	 ddej                  d	e
eej                        fd
Zefdedef   dedej"                  egej"                  f   fdZefdeej*                     dedej"                  egej"                  f   deej*                     fdZy)LocalBufferContexta  
    This class creates a context that helps to generate code involving Inductor IR with
    function local buffers. These buffers are constructed during the codegen process and
    are used to store intermediate results such as local accumulators. We do not want to
    add them to `V.graph` since they are not global and we do not want to add them as
    function arguments either. So we patch the codegen processes under this scope to support
    these buffers without exposure to the outside world.
    kernel_argsr_   Nc                     || _         t        j                         | _        i | _        i | _        i | _        t               | _        y r=   )	r   
contextlib	ExitStack
exit_stacklocal_buffersglobal_buffersr   r
   removed_buffers)r~   r   s     rD   ry   zLocalBufferContext.__init__m  s<    &$..03546570:rU   c                      j                   j                          t        j                  j                   fd} j                   j                  t        j                  t        j                  d|              j                  j                   fd} j                   j                  t        j                   j                  d|              j                  j                   fd} j                   j                  t        j                   j                  d|              j                   j                  t        j                                 S )Nc                 j    | j                   v rj                   |    j                         S  |       S r=   )r  	get_dtype)rq   original_get_dtyper~   s    rD   r  z/LocalBufferContext.__enter__.<locals>.get_dtype}  s6    t)))))$/99;;%d++rU   r  c                 4    | j                   v r| S  |       S r=   r  )rq   original_inputr~   s    rD   inputz+LocalBufferContext.__enter__.<locals>.input  s!    t)))!$''rU   r  c                 4    | j                   v r| S  |       S r=   r
  )rq   original_outputr~   s    rD   outputz,LocalBufferContext.__enter__.<locals>.output  s!    t)))"4((rU   r  )r  	__enter__r   r   r  enter_contextr   objectr   r  r  set_local_buffer_context)r~   r  r  r  r  r  r  s   `   @@@rD   r  zLocalBufferContext.__enter__y  s    !!#WW..	,
 	%%ell177K&ST))//	(
 	%%ell43C3CWe&TU**11	)
 	%%ell43C3CXv&VW 	%%a&@&@&FGrU   c                 r    | j                   j                          | j                  j                  |||       y r=   )r  clearr  __exit__)r~   exc_typeexc_valexc_tbs       rD   r  zLocalBufferContext.__exit__  s*      "  7F;rU   local_bufferr  c                    |j                         | j                  vsJ || j                  |j                         <   |r|D ]  }|j                         }|| j                  vr|| j                  vsJ || j                  |<   || j                  |<   |t        j
                  j                  vsl| j                  j                  |       t        j
                  j                  j                  |        y y r=   )r   r  r  r   r   r   r  r   )r~   r  r  global_bufferglobal_buffer_names        rD   add_local_bufferz#LocalBufferContext.add_local_buffer  s     $$&d.@.@@@@6B<0023!/ D%2%;%;%="&d.A.AA*$2F2FFG ;H##$67;G$$%78%QWW-D-DD ((,,-?@GG++//0BCD rU   fn.r   r   c                       fd}|S )Nc                      t        j                  t        t        j                         j                              5   | i |cd d d        S # 1 sw Y   y xY w)N)r   r   )r   set_ops_handlerr   get_ops_handlerr   )rL   rr   r  r   r~   s     rD   r   z3LocalBufferContext.localize_function.<locals>.inner  sU    ""%%%'$($8$8"/ + 4*6*+ + +s   AArg   )r~   r  r   r   s   ``` rD   localize_functionz$LocalBufferContext.localize_function  s    	+ rU   nodesc                      t        |      dkD  sJ dt        j                  f fd}|D cg c]
  } ||       c}S c c}w )aX  
        Given `local_buf` and `global_buf` registered in current `LocalBufferContext`
        though the method of `add_local_buffer`, localizes the `global_buf` to `local_buf`
        for the given `nodes` and returns a new list of IR nodes that work on `local_buf`
        instead of `global_buf`, i.e., all the loads and stores are redirected to
        `local_buf`. This helps the fused loops to work on smaller-sized local buffers
        for better data locality.

        The the data access of `local_buf` is assumed to be contiguous with the
        same order as the `global_buf`.
        r   r^   c                    t        | t        j                        r| j                  n| }t        |t        j                        sJ j                  |j                        }t        j                  ||      }t        | t        j                        r6t        j                  | j                         | j                         |      }|S |}|S )N)inner_fn)rq   layoutdata)r?   r   ComputedBufferr*  Loopsr$  r(  dataclassesreplacer   r   )r^   loopsnew_inner_fn	new_loopsnew_noder   r~   s        rD   wrap_inner_fn_for_nodezALocalBufferContext.localize_nodes.<locals>.wrap_inner_fn_for_node  s    !+D"2C2C!DDII$EeRXX...11L
 $++ELII$ 1 12,,1B O %OrU   )r   r   IRNode)r~   r%  r   r3  r^   s   ` `  rD   localize_nodesz!LocalBufferContext.localize_nodes  sA    $ 5zA~~	 	$ :??&t,???s   A r=   )r   r   r   __doc__r   ry   r  r  r   r   r   rY   r  r   r   r   r{   r   r   r$  r   r4  r5  rg   rU   rD   r   r   c  s    
=J 
=4 
= D<
 TXDIID7?RYY7PD4 'S#X  $ejj#6

B
0 $&@BII&@  $ejj#6

B
&@ 
bii&@rU   r   buffervars.c                 "      fd|D        }|S )zk
    Given list of cse variables,
    Cast each to new mask base dtype and return casted cse variable.
    c              3      K   | ]M  }t         j                  j                  j                  t         j                  j	                  |              O y wr=   )r   r   r   generate_get_mask_cast)rB   r   r7  rA   s     rD   rE   z'unify_mask_base_type.<locals>.<genexpr>  sH      
 	 	
xx&&sE23	
s   AArg   )r7  r8  rA   new_varss   ` ` rD   unify_mask_base_typer>    s    
 H OrU   c                     | j                   t        j                  k(  rV|j                   t        j                  k(  sJ t        j                  }t	        t
        j                  j                  | |f|      S | |fS )zx
    Given two cse variables, when dtype is bool, unify them to the same mask dtype and return casted cse variable.
    )rA   rJ   r,   int32r>  r   r   compute)ab
mask_dtypes      rD   may_unify_binary_op_mask_typerE    sY     	ww%**ww%**$$$[[
#AHH$4$4q!fjIIa4KrU   c                    t        | j                        sJ |j                  d       |j                         5  |j                  t        | j                      dt
        j                  j                   d       |j                  t        |    dt
        j                  j                   d       |j                  |  d       |j                  dt        | j                      dt
        j                  j                   d       |j                         5  |j                  |       d d d        t
        j                  j                  |	      }|d
k(  r|j                  dt        |    d       n|j                  dt        |    d| d       d d d        |j                  d       |S # 1 sw Y   xY w# 1 sw Y   (xY w)Nz[&]()z offset[z];z result[z.store(offset);zfor( z offset_idx = 0; offset_idx < z; offset_idx++ ))rA   r   zreturn at::vec::Vectorized<z>::loadu(result);zreturn at::vec::VectorizedN<z, z())	r	   rA   	writelineindentDTYPE_TO_CPPr   r   tiling_factor_get_num_vectors)offsetcoderand_function	dst_dtypenum_vectorss        rD   codegen_randrQ    s   FLL)))NN7	 FLL)*(1883I3I2J"M	
 	,y12(188;Q;Q:RRTUV&12L.//MahhNdNdMeeuv	
 [[] 	*NN=)	*hh//i/@!NN-l9.E-FFWX NN.|I/F.Gr+Vgh#( 	NN4K	* 	* s%   CF7F+'A)F7+F4	0F77G c                     | t         j                  t         j                  fv r t         j                  t         j                  fS t         j                  t         j                  fS r=   )rJ   uint8int8r@  float32)input_dtypes    rD   *get_gemm_template_output_and_compute_dtyperW  )  s=    u{{EJJ//U[[))u}}--rU   c                   	
 | j                         | j                         
dk(  r
fd}nxdk(  r(d|v sJ |d   dk(  r
fd}n]|d   dk(  sJ 
fd}nKd	k(  rfd
}n?dk(  rfd}n3dk(  rfd}n'dk(  sdk(  rd 
fd}ndk(  r)d|v sJ t        |d         dk(  sJ |d   d   
fd}ndk(  r2d|v sJ t        |d         dk(  sJ |d   d   |d   d   
fd}ndv r[d|v sJ |d   }t        | j                               }t        |j                               }||z
  	|j                         	fd}nLdk(  r9d|v sJ d|v sJ d|v sJ |d   |d   }|d   
|j                         fd }nt	        d!       t        j                  | j                         
|| j                         "      S )#Nreluc                 l     |       }t        j                  d      }t        j                  ||      S Nr   )r   constantmaximum)r   r  zerorA   input_loaders      rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn5  s.     'E<<5)D;;ud++rU   gelu	algorithmnonec                     |       }t         j                  k7  r$t        j                  |t         j                        }t        j                  dt         j                        }t        j                  dt         j                        }t        j                  dt         j                        }||z  t        j
                  ||z        |z   z  }t         j                  k7  rt        j                  |      }|S )N      ?      ?g;f?)rJ   r!   r   rP   r\  erf)r   r  r#   oneconstresultrA   r_  s         rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn>  s    $U+EKK'LL<E||C5ll34%7E)?#)EFEKK' \\&%8FrU   tanhc                 R    |       }t         j                  k7  r$t        j                  |t         j                        }t        j                  dt         j                        }t        j                  dt         j                        }t        j                  dt         j                        }t        j                  dt         j                        }||z  |t        j
                  ||||z  |z  |z  z   z        z   z  }t         j                  k7  rt        j                  |      }|S )Nrd  re  gQ63E?gHm?)rJ   r!   r   rP   r\  rj  )	r   r  r#   rg  const1const2ri  rA   r_  s	          rD   r(  z+create_epilogue_with_attr.<locals>.inner_fnM  s    $U+EKK'LL<E||C5ll34&8%++Fh< ((6UVe^e5Ke5S-S#TUV  EKK' \\&%8FrU   swishc                 H     |       }|t        j                  |      z  }|S r=   r   sigmoid)r   r  ri  r_  s      rD   r(  z+create_epilogue_with_attr.<locals>.inner_fnc  s%     'ES[[//FMrU   rq  c                 :    t        j                   |             S r=   rp  r   r_  s    rD   r(  z+create_epilogue_with_attr.<locals>.inner_fnj  s    ;;|E233rU   c                 :    t        j                   |             S r=   )r   rj  rs  s    rD   r(  z+create_epilogue_with_attr.<locals>.inner_fno  s    88L/00rU   	hardswishhardsigmoidc                    t        j                  dt        j                        }t        j                  dt        j                        }t        j                  dt        j                        }t        j                  dt        j                        }t        j                  | |z   |      }t        j
                  ||      }||z  S )Nr         gUUUUUU?)r   r\  rJ   r!   r]  minimum)r  r^  sixthreeone_over_sixr   mins          rD   hardsigmoid_floatz4create_epilogue_with_attr.<locals>.hardsigmoid_floatt  s    <<5;;/D,,q%++.CLLEKK0E<<(;U[[IL++eemT2C++c3'C%%rU   c                      |       }t         j                  k7  r$t        j                  |t         j                        } |      }dk(  r||z  }t         j                  k7  rt        j                  |      }|S )Nru  )rJ   r!   r   rP   )r   r  ri  attrrA   r  r_  s      rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn}  si     'E#UEKK8&u-F{"#fe4MrU   
leaky_reluscalarsr   r   c           	          |       }t         j                  k7  r$t        j                  |t         j                        }t        j                  dt         j                        }t        j
                  ||kD  ||t        j                  t         j                        z        }t         j                  k7  rt        j                  |      }|S r[  )rJ   r!   r   rP   r\  where)r   r  r^  ri  rA   r_  negative_slopes       rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn  s     'E#UEKK8<<5;;/DYYeUS\\.%++-V%VF #fe4MrU   hardtanhr   c           	          |       }t         j                  k7  r$t        j                  |t         j                        }t        j                  t        j
                  |t        j                  t         j                              t        j                  t         j                              }t         j                  k7  rt        j                  |      }|S r=   )rJ   r!   r   rP   rz  r]  r\  )r   r  ri  rA   r_  	max_value	min_values      rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn  s     'E#UEKK8[[E3<<	5;;#GHY4F #fe4MrU   )r   submulotherc                     t        t              }dk7  r | |        | d              S  | |        |             S r[  )getattrr   )r   opr  	dims_diffr_  other_loaders     rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn  sK    d#BA~,u-|E)*<M/NOO,u-|E/BCCrU   bias_addbetarA   c                      |       } |       }dk7  r,t        j                  t        j                        |z  |z   }|S ||z   }|S )Nr   )r   r\  rJ   r!   )r   biasr  ri  r  bias_loaderr_  s       rD   r(  z+create_epilogue_with_attr.<locals>.inner_fn  sS    u%D 'EqydEKK84?%G M MrU   z Unsupported epilogue attribute: )devicerA   r(  ranges)make_loaderr  r   r   
ValueErrorr   	Pointwise
get_device)input_bufferr  rr   r(  r  num_input_dimsnum_other_dimsr  r  r  rA   r  r_  r  r  r  r  s    `     @@@@@@@@@@rD   create_epilogue_with_attrr  0  sf   ++-L""$Ev~	,
 
f$$$+&(
 +&&000( 
	
 
		4 
	1 
	 5	&		 		 
	F"""6)$%***	*1-
	 
	F"""6)$%***9%a(	9%a(	
	 
	 
&	&&   w\2245U^^-."^3	((*	D 	D 
	&   &   f~ww'')	 ;D6BCC<<&&($$&	 rU   c                    t        d | D              r| }nt        | d   d      rYt        d | D              sJ t        d | D              sJ | D cg c]%  }|j                  j                  d   j                  ' }}nNt        d | D              sJ t        d | D              sJ | D cg c]  }|j                  d   j                   }}|J |S c c}w c c}w )Nc              3   <   K   | ]  }t        |t                y wr=   )r?   r   rB   r  s     rD   rE   z!_get_loop_body.<locals>.<genexpr>  s     
6:b(#
6s   r   original_fnc              3   4   K   | ]  }t        |d         yw)r  N)rn   r  s     rD   rE   z!_get_loop_body.<locals>.<genexpr>  s     Dbwr=1Ds   c              3   ~   K   | ]5  }t        |j                  j                  d    j                  t               7 ywr   N)r?   r  rL   _bodyr   r  s     rD   rE   z!_get_loop_body.<locals>.<genexpr>  s1      GI
2>>..q177Bs   ;=c              3   P   K   | ]  }t        |t        j                           y wr=   )r?   rH   rX   r  s     rD   rE   z!_get_loop_body.<locals>.<genexpr>  s     KRz"i&7&78Ks   $&c              3   j   K   | ]+  }t        |j                  d    j                  t               - ywr  )r?   rL   r  r   r  s     rD   rE   z!_get_loop_body.<locals>.<genexpr>  s%     P"z"''!*"2"2H=Ps   13)rG   rn   r  rL   r  )fn_listloop_bodiesr  s      rD   _get_loop_bodyr    s    

6g
6671:}-DGDDDD MT    CJJB2>>..q177JKJK7KKKKPPPPP6=>2771:++>K>""" K ?s   *C* Cc                    t        t        j                            }| D ]  }|j                  j                  gt        |j                  j                               D cg c]  }|j                   c}z   }|D ]Y  }|j                  D ]H  }|j                  dk7  r|j                  |j                  t        j                     j                         J [  |S c c}w )Ncall_method)r
   rJ   rA   
root_blockr   rY   	subblocksvaluesr%  r  r   r2   r    rb   )r  dtypes	loop_bodybodygraphsr   r^   s          rD   _get_dtype_from_loopbodiesr    s    $&F  E	&&,,-#'	(;(;(B(B(D#E1
DJJ1
 
  	EE E77m+

499%8%<%<=CCDE	E	E M1
s   Ctemplate	epiloguesc                 B   dt         j                  dt        t           dt        t        j
                     fddt        t        j
                     dt        t           dt        t        t        f   fddt        t           d	t        t         j                     dt        t        t        f   ffd
}| j                         sJ | j                         }|D cg c]/  }|j                         D ]  }|j                  |j                   1 }}} |||      S c c}}w )Nepilogue_nodetemplate_buf_namesr_   c                 x    | j                         D cg c]  }|j                  |v r|j                   c}S c c}w r=   )	get_readsrq   r   )r  r  reads      rD   !_get_indexes_of_template_buf_readzStemplate_fusion_with_epilogues_supported.<locals>._get_indexes_of_template_buf_read  s>    
 &//1
yy.. JJ
 	
 
s   !7index_of_template_buf_readepilogue_writesc                     t        t        |             }|dkD  rd}d}||fS |dk(  rd}d}||fS |dk(  r| d   t        fd|D              }|}||fS t        d      )Nr   Fr   Tc              3   <   K   | ]  }|j                   k(    y wr=   r   )rB   writeiotbrs     rD   rE   zftemplate_fusion_with_epilogues_supported.<locals>._check_supported_and_same_indexes.<locals>.<genexpr>  s     OeU[[E1Os   zShould not reach here)r   r
   rG   AssertionError)r  r  num_indexes
same_index	supportedr  s        @rD   !_check_supported_and_same_indexeszStemplate_fusion_with_epilogues_supported.<locals>._check_supported_and_same_indexes  s     *%?@A?JI *$$ AJI *$$ A.q1EOOOJ #I *$$ !!899rU   template_outputsepilogue_nodesc                 n   | D cg c]  }|j                          }}|D cg c]  } ||       }}|D cg c]  }|j                         j                   }}t        ||      D cg c]  \  }} ||       }	}}t        |	 \  }
}t	        |
      t	        |      fS c c}w c c}w c c}w c c}}w r=   )r   get_read_writeswritesziprG   )r  r  r   r  r  indexes_of_template_buf_readsepilogue_nodes_writesreadsr  resultsr  same_indexesr  r  s               rD   _template_fusion_supportedzLtemplate_fusion_with_epilogues_supported.<locals>._template_fusion_supported  s     5EEqajjlEE "0)
 .m=OP)
% )

 IW!
7DM))+22!
 !
 "%-/D"
v .eV<
 
 #&w-	<9~s<000! F)
!

s   B"B'!B,+B1)r   	OperationrY   r   r{   r   r   r
   r   r   r,   r   is_templateget_outputsr   r^   )	r  r  r  r  epiloguerC   r  r  r  s	          @@rD   (template_fusion_with_epilogues_supportedr    s   
||
9=c
	ejj	
%$,UZZ$8%#C% 
tTz	%.1"?31EI",,EW1	tTz	1* !!!++- "##% 66 	
N  &&6GGs   4D)pr   r-  rH   r   syscollectionsr   collections.abcr   typingr   r   r   unittest.mockr   r{   rJ   torch._prims_commonr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   _CppPrintertorch.utils._sympy.symbolr   r   torch.utils._sympy.value_rangesr    r   dependenciesr   r  r   r   r   r   utilsr   r   r   virtualizedr   r   r   commonr   r   r   r   r    rU  float64float16int64r@  int16rT  uint64uint32uint16rS  r,   r-   	complex32	complex64
complex128r.   r/   r0   r1   rI  DTYPE_TO_ATENDEVICE_TO_ATENstrided_mkldnnLAYOUT_TO_ATENplatform_IS_WINDOWSr   r7   rM   r]   fxNoderc   rh   rt   r@   r   r   r   r   r   r   r   r   WrapperHandlerr   r   r!   r   r>  rE  rQ  rW  r  r  r  rY   r  rg   rU   rD   <module>r     s       
 " $ * *    0 / A : 7     : N N * * 	MM7	MM8 
MM6 
KK	
 
KK 
KK 
JJ 
LL* 
LL* 
LL* 
KK 
JJ 
NNJ 
OO) 
OO*  
,!" 
#$ 
}	,	,).	MM<	MM= 
MM; 
KK	
 
KK 
KK 
JJ 
LL- 
LL- 
LL- 
KK 
LL- 
LL- 
JJ 
NNO  
OO'!" 
OO(#$ 
*	-	)	1	1-4 	 
MM>	MM=
 llg%
.*KL<8ehhmm 8(; 83"5 3

<92[ 92x% % 	8
94:: 84:: "#OA,, #OLJ@ J@` ++
S 
!& 9> 6.cL&AHAH,01B,CAH
4:AHrU   