
    VhU                    D	   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe ej                  dk(  Zg ej                  d      d        Ziej                  j                  eld      Zm eg d      Zndddddddd d!d!d"
Zo eg d#      Zpd$d%d&d'd(d)d*d+d,d-d.
Zqd/d0d1Zrej                  ej                  gZuej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g	Z}e~ej                     ed2<   ej                  ej                  ej                  ej                  ej                  gZe~ej                     ed3<   d4 Zd5 Z	 	 dwd6eej                     fd7Zd8 Zd9eRd:ej                  d;ed<ej                  d=ej                  d>eLfd?Zd@eeeNf   dAedBedCej                  dDeeef   f
dEZdFeRdGedHefdIZej                  d6ej                  dJej                  fdK       Zej                  d6ej                  dJej                  dLefdM       Zej                  	 dxd6ej                  dJej                  dLee   fdN       Zej                    G dO dP             Z G dQ dRe5      Z G dS dT      Z G dU dVeU      Zej+                  dW        G dX dYe      Zej+                  dZ       ej/                           G d[ d\e      Z G d] d^eS      Z G d_ d`e      Z G da dbe      Zdce/d>eeej                     exf   fddZ G de df      Z G dg dhe      Z G di dje      Z G dk dle      Z G dm dne2      Z G do dp      Z G dq dr      Zej                    G ds dt             Zej                    G du dv             Zy)y    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )	codecacheconfigcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPP
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                      t         rdS dS )Nz__declspec(dllexport) _IS_WINDOWS     K/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrX   W   s    &1"9r9rV   schedule)+*^||minmaxrZ   r[   r\   r^   r_   argminargmaxr]   welford)
sumprodxor_sumr^   r_   r`   ra   anywelford_reducewelford_combine)
r_   r^   rc   rd   re   rg   rh   r`   ra   rf   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrk   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                    |t         v rt        j                  }| dv ry| dk(  ry| dv r|t        |   }|t        j                  k(  r| dv rt        t        j
                     }t        |      rd| dnd	| d
}t        |      rd	| dnd	| d}| dv r|n|}| dv r|S d| d| dS t        |       rdt        |    dS t        |       )N)re   rc   rf   r   rd   r5   )r_   ra   r^   r`   r`   ra   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r_   ra   )r_   r^   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rH   rk   rn   r   r*   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rW   reduction_initr      s    22;;e$EJJ>5I#I!%++.F e$ $F8=9'xx8 	 e$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N+,u-.c22

((rV   c                     t         t        |      }t        |       rd| dS | dv r0|t        j                  k(  rt         t        j
                     }d| dS |S )Nr   >ry   r   )rH   r<   r*   r   rk   rn   )r   r   scalar_types      rW   reduction_acc_typer      sa    9%@AKN++a((--EJJ&u{{3K[M++rV   indexc           	      >   |t         j                  k(  }| dk(  r|rdnd}| d| d| S | dk(  r| d| S | dk(  r| d| S | d	k(  r| d
| S | dv r|  d| d| dS | dk(  r	d| d| dS | dk(  r6t        |t              r|\  }}}	nt	        | |      \  }}}	d| d| d| d|	 d	S | dv rkt        |d      rD|j                  t         j                  k(  r'|j                  s||  d| d| d| dS |  d| d| dS ||  d| d| d| dS |  d| d| dS t        |       )Nrc   |rZ    rd    * re    ^ rf    || )r^   r_   z_propagate_nan(, )rg   welford_combine(rh   , {})ry   r   z	_combine(z, static_cast<float>(), )))	r   rk   
isinstancetuplereduction_projecthasattrr   is_vecr   )
r   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weights
             rW   reduction_combiner      s    5::%G$c#a}Aj\22c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%()D"f0LD"f!#d4&2$bDD--J(  EJJ.%% ()3%7LZLX[\a[bbcdd &&iu4I*UWX $%Yse2j\E7!LL$%Yse2j\CC

((rV   c                 J    t        |       r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightry   z.index)r*   )r   accs     rW   r   r     sC    N+e}SkcU'?::	/	/f~JrV   codeiter_varnew_iter_var
loop_startloop_endreturnc                 b   t               }t        j                         5 }|j                  dt         d| dt        |       d| dt        |       d| dz          |j                  |j                                t        | j                        D ]  \  }}t        |t        t        f      sJ d}	t        |t              r|j                  }	|j                  }t        j                   d	| z   d	z   | |      }
|	rt        |	|
      }
|j                  |
        	 ddd       |S # 1 sw Y   |S xY w)
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r7   
contextlib	ExitStack	writelinerI   rE   enter_contextindent	enumerate_linesr   ro   r;   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rW   move_code_under_inner_loopr   	  s?   , $~				 15""J<qc+j2I1J!Lc+h"7!8<.LM	
 	,3356 - 	1GAt    !M$- $		yyvve
3e;~PTUH'x@&&x0	11, -1, s   C:D$$D.acc_varacc_typer   r   lenc                     t               }t        j                         rd|  d| d| dn
| d|  d| d}|j                  |        |j	                  d| d	d
d|  d |||       ddg       |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performence.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r=   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rW   reduction_prefix_arrayr   9  s     !"K !!# y0
$se2Fz7)5R0 
 XJ("3%v.7):gne&D%EQG		
 rV   bufferr   new_namec                 D   t        | j                        D ]  \  }}t        |t        t        f      sJ t        |t              r/t        j                  d| z   dz   | |j                        |_        ]t        j                  d| z   dz   | |      | j                  |<    y )Nr   )r   r   r   ro   r;   r   r   r   )r   r   r   ir   s        rW   replace_acc_namer   Z  s    V]]+ V4
 	
 
 dL)u$058XJSDI!vve&7%&?H:PTUFMM!VrV   r   c                     | j                  |      st        j                  j                  S ||dz   i}t	        | |      }t        j
                  || z
        S Nr5   )hassympySZeror0   simplify)r   r   replacement	new_indexs       rW   	stride_atr   i  sJ    99S> ww||a.K5+.I>>)e+,,rV   
vec_lengthc                   	 dd	fd}	fd}| }t        j                  dd      }| j                  t              r| j	                  t        |      |      } t        j                  dd      }| j                  t
              r| j	                  t        ||      |      } t        j                  |       } | |k7  rt        |       S | S )a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                     t        |       }t        j                  |       k(  rt        j                   d       }dz  |S )N_div_cr5   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rW   visit_indexing_divz7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrV   c                    t        | |      }t        j                  |       k(  r!t        j                   d       }dz  |S | dk(  r;t        j                  |      k(  r"t        j                   d       z   }dz  |S )N_mod_cr5   )r   r   r   r   )r   modulusr   mod_freevar_idr   r   s      rW   visit_modular_indexingz;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     gw799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrV   r   T)integerr   )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r   r   original_indexdivmodr   r   s
    ``     @@rW   r  r  u  s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!oc3<>TUNN5!E*5#zBBLrV   c                 8    |rt        | ||      } t        | |      S N)r  r   )r   r   r   s      rW   stride_at_vec_ranger    s"     +E3
CUC  rV   c                   &    e Zd ZU dZeed<   eed<   y)ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rm   __annotations__rU   rV   rW   r	  r	    s    
 rV   r	  c                   d     e Zd Zededefd       Zdddeeee	f      f fdZ
d Zd	 Zd
 Z xZS )OuterLoopFusedSchedulerNodenode1node2c                    |j                   |j                   u sJ t        d ||fD              sJ t        d ||fD              rt | |j                   t        |      t        u rt        |j                               n|gt        |      t        u r t        |j                               z   |      S |gz   |      S  | |j                   ||g|      S )Nc              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )typer  r$   r"   .0nodes     rW   	<genexpr>z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  s1      
  J+"
   &(c              3   >   K   | ]  }t        |      t        u   y wr  r  r  r  s     rW   r  z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>       TTtDz88T   )	schedulerallrf   r  r  listget_outer_nodes)clsr  r  outer_loop_fusion_depths       rW   fusez OuterLoopFusedSchedulerNode.fuse  s     %//111 
 
 
 	
 
 TeU^TT E{&AA ..01  E{&AA ..01 (!   (! & u8OPPrV   r!  r#   outer_fused_nodesc                     || _         || _        g }| j                   D ]B  }t        |t        t        f      sJ |j                  t        |j                                      D t        | %  ||       y r  )
r(  r&  r   r$   r"   extendr#  	get_nodessuper__init__)selfr!  r(  r&  flatten_snodes_node	__class__s         rW   r-  z$OuterLoopFusedSchedulerNode.__init__  su      	 (?$++ 	;Eem5G%HIII!!$u'8"9:	; 	N3rV   c                     | j                   S r  )r(  r.  s    rW   r$  z+OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%%rV   c           
      8   dt         dt         dt        dt        dt        f
fdt        t	        |      dz
        D ]0  }||   j
                  }||dz      j
                  } |||d      r0 y	 |D ]  }t        j                  t        j                  |j                  d |       }t	        |j                        |kD  sMt        |t        j                        sht        |j                  |   t        j                        s|d
z  |j                  |   k  s y	 y)Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                 ^   | j                   sJ |j                   sJ | j                   |   |j                   |   g d}t        fd|D              sy|dk\  sJ |dz
  x}dkD  rE|dz   }|t        | j                         k  sJ |t        |j                         k  sJ  | |||      syy)N)r   sizeoffsetstepsc              3   P   K   | ]  }t        |      t        |      k(    y wr  )getattr)r  attr_compareleft_loop_levelright_loop_levels     rW   r  zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>  s2       % O\:/>?s   #&Fr5   r   T)loopsr"  r   )r5  r6  r7  r8  outer_loops_attr_compare_listr@  rA  _inners        @@rW   rD  zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner	  s     "''''"((((,223IJO.445KL-)   )F  $)))%6%::!a?)?!)C&-N4H4H0IIII-O4I4I0JJJJ"#%*	 !rV   r5   r   F,  T)LoopNestrm   rk   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r.  cpp_kernel_proxy_listr&  idxr5  r6  cpp_kernel_proxyouter_rangesrD  s	           @rW   "check_outer_fusion_loop_level_attrz>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr   s<   (	$(	%(	  #(	 %(	(	
 (	T 23a78 		C237AAN3C!G<FFO'	 		 !6 	$++ ''(@)@AL $++,/FF|U]];$++,CDMM !3&"))*ABC )	, rV   c                 D   |d   j                   }t        |      }|D cg c]'  }|j                  j                  | j                        ) c}|_        |d   }||j                  _        |j                  j                  d | j                   |j                  _        |S c c}w Nr   )kernel_groupOuterLoopFusedKernelrH  from_loop_levelr&  innerkernelrB  )r.  rO  rV  outer_loop_fused_kernelproxyouter_fused_proxys         rW   merge_outer_fusion_kernelsz6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelsV  s     -Q/<<"6|"D /)
 OO++D,H,HI)
% 2!4-D##*,=,G,G,M,M*d**-
##) ! )
s   ,B)r  r  r  classmethodr   r'  r#  r	   r"   r$   r-  r$  rS  r^  __classcell__r1  s   @rW   r  r    sb    !Q%!Q.?!Q !QF44  &8-&G HI4 &Tl!rV   r  c                   2    e Zd ZddefdZd Zd Zd Zd Zy)	RecordOptimizationContext	func_namec                 .    || _         d | _        d | _        y r  )rd  current_nodeopt_ctx)r.  rd  s     rW   r-  z"RecordOptimizationContext.__init__i  s    "596:rV   c                    t         j                  sJ t         j                  j                  sJ t         j                  j                  | _        | j                  J t        j                  | j                  j
                  v r-| j                  j
                  t        j                     | _        nt               | _        | j                  J | j                  | j                  _        | S r  )	r4   interpreterrf  rA   keymetarg  rd  ops_namer3  s    rW   	__enter__z#RecordOptimizationContext.__enter__n  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rV   c                     | j                   sJ | j                  sJ | j                  | j                   j                  t        j                  <   y r  )rf  rg  rk  rA   rj  r.  exc_typeexc_valexc_tbs       rW   __exit__z"RecordOptimizationContext.__exit__|  s>        |||:>,,2667rV   c                     | j                   S r  )rg  r3  s    rW   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s    ||rV   c                 6    | j                   sJ | j                   S r  )rf  r3  s    rW   get_fx_nodez%RecordOptimizationContext.get_fx_node  s           rV   N)rR   )	r  r  r  ro   r-  rm  rs  ru  rw  rU   rV   rW   rc  rc  h  s#    ;# ;
G
!rV   rc  c                      e Zd ZdZed        Zed        Zed        ZedLd       Zed        Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedC        ZEedD        ZFedEeGj                  dFeGj                  fdG       ZIedEeGj                  dFeGj                  fdH       ZJedEeGj                  dFeGj                  fdI       ZKedJ        ZLedK        ZMy)MCppOverrideszMap element-wise ops to C++c                     d|  d|  d| dS )N	decltype()( + r   rU   abs     rW   addzCppOverrides.add      1#Rs#aS**rV   c                     d|  d|  d| dS )Nr{  r|   - r   rU   r~  s     rW   r   zCppOverrides.sub  r  rV   c                     d|  d|  d| dS )Nr{  r|  r   r   rU   r~  s     rW   rL  zCppOverrides.mul  r  rV   Nc                    t        | t              sJ || j                  }t        j                  j                  | ||      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        v r6|t        j                  k(  r#	 t        j                  j                  | |||       |S )Nto_dtyper   )r   rG   r   r4   rZ  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   rn   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rW   r  zCppOverrides.to_dtype  s    !^,,,Ixx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>> HH((IvuErV   c                 T    |t         v sJ | dt         d       dt         |    d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rH   r  )r  r   r   s      rW   to_dtype_bitcastzCppOverrides.to_dtype_bitcast  s=    $U~hZ}&UU$U 34Bqc;;rV   c                     d|  dS )Nz	std::abs(r   rU   r  s    rW   abszCppOverrides.abs      1#QrV   c                     d|  dS )Nz	std::sin(r   rU   r  s    rW   sinzCppOverrides.sin  r  rV   c                     d|  dS )Nz	std::cos(r   rU   r  s    rW   coszCppOverrides.cos  r  rV   c                     d|  d|  dS )Nr{  z)(-r   rU   r  s    rW   negzCppOverrides.neg      1#S1%%rV   c                     d|  dS )Nz	std::exp(r   rU   r  s    rW   expzCppOverrides.exp  s     1#QrV   c                     d|  dS )Nz
std::exp2(r   rU   r  s    rW   exp2zCppOverrides.exp2      A3a  rV   c                     d|  dS )Nzstd::expm1(r   rU   r  s    rW   expm1zCppOverrides.expm1      QCq!!rV   c                     d|  dS )Nz	std::erf(r   rU   r  s    rW   erfzCppOverrides.erf  r  rV   c                     d|  dS )Nz
std::erfc(r   rU   r  s    rW   erfczCppOverrides.erfc  r  rV   c                     d|  dS )Nzcalc_erfinv(r   rU   r  s    rW   erfinvzCppOverrides.erfinv      aS""rV   c                     d|  dS )Nz
std::sqrt(r   rU   r  s    rW   sqrtzCppOverrides.sqrt  r  rV   c                     d|  dS )Nz1 / std::sqrt(r   rU   r  s    rW   rsqrtzCppOverrides.rsqrt  s    s!$$rV   c                 |    t         j                  j                  }|dk(  r|  d|  dS |d|  dS t        d|      )Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rW   log1pzCppOverrides.log1p  sW    jj66*SQCt,,[ 1%% J3'R rV   c                     d|  dS )Nz	std::tan(r   rU   r  s    rW   tanzCppOverrides.tan  r  rV   c                     d|  dS )Nz
std::tanh(r   rU   r  s    rW   tanhzCppOverrides.tanh	  r  rV   c                 &    t         rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rS   r  s    rW   signbitzCppOverrides.signbit  s-      /qc4	
 !1%	
rV   c                     d|  d| dS )Nz	std::pow(r   r   rU   r~  s     rW   powzCppOverrides.pow  s    1#Rs!$$rV   c                     d|  dS )Nz	std::log(r   rU   r  s    rW   logzCppOverrides.log  r  rV   c                     d|  dS )Nzstd::nearbyint(r   rU   r  s    rW   roundzCppOverrides.round!  s     1%%rV   c                     d|  dS )Nzstd::floor(r   rU   r  s    rW   floorzCppOverrides.floor%  r  rV   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rU   )r  r  quotrems       rW   floordivzCppOverrides.floordiv)  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrV   c                     d|  dS )Nz
std::ceil(r   rU   r  s    rW   ceilzCppOverrides.ceil0  r  rV   c                     d|  dS )Nzstd::trunc(r   rU   r  s    rW   trunczCppOverrides.trunc4  r  rV   c                     |  d| S Nr  rU   r~  s     rW   truncdivzCppOverrides.truncdiv8  s     Cs|rV   c                     d|  d| dS )Nz
std::fmod(r   r   rU   r~  s     rW   fmodzCppOverrides.fmod=  s    A3b1%%rV   c                     d|  dS )Nzstd::isinf(r   rU   r  s    rW   isinfzCppOverrides.isinfA  r  rV   c                     d|  dS )Nzstd::isnan(r   rU   r  s    rW   isnanzCppOverrides.isnanE  r  rV   c                     d|  dS )Nzstd::lgamma(r   rU   r  s    rW   lgammazCppOverrides.lgammaI  r  rV   c                     d|  dS )Nz
std::acos(r   rU   r  s    rW   acoszCppOverrides.acosM  r  rV   c                     d|  dS )Nzstd::acosh(r   rU   r  s    rW   acoshzCppOverrides.acoshQ  r  rV   c                     d|  dS )Nz
std::cosh(r   rU   r  s    rW   coshzCppOverrides.coshU  r  rV   c                     d|  dS )Nz
std::sinh(r   rU   r  s    rW   sinhzCppOverrides.sinhY  r  rV   c                     d|  dS )Nz
std::asin(r   rU   r  s    rW   asinzCppOverrides.asin]  r  rV   c                     d|  dS )Nzstd::asinh(r   rU   r  s    rW   asinhzCppOverrides.asinha  r  rV   c                     d|  d| dS )Nzstd::atan2(r   r   rU   r  ys     rW   atan2zCppOverrides.atan2e      QCr!A&&rV   c                     d|  dS )Nz
std::atan(r   rU   r  s    rW   atanzCppOverrides.atani  r  rV   c                     d|  dS )Nzstd::atanh(r   rU   r  s    rW   atanhzCppOverrides.atanhm  r  rV   c                     d|  d| dS )Nzstd::copysign(r   r   rU   r  s     rW   copysignzCppOverrides.copysignq  s    s"QCq))rV   c           	         d|  dd|  df}t        d |D              rt        d |D              S t               }t        j                  j
                  j                  t        j                        }t        j                  j
                  j                  | j                        }|j                  d| d       |j                  d	| d
|  d| d       t        j                  j                  j                  |       ||f}t        ||      D ]/  \  }}t        j                  j
                  j                  ||       1 ||fS )Nfrexp()[0])[1]c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r4   rZ  r  try_getr  	cache_keys     rW   r  z%CppOverrides.frexp.<locals>.<genexpr>x  (     Wyqxx||##I.d:W   57c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r   s     rW   r  z%CppOverrides.frexp.<locals>.<genexpr>y  #     UY--i8U   35r   zint32_t r   r   z = std::frexp(, &r   )r"  r   r7   r4   rZ  r  newvarr   int32r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsr  cse_vars           rW   frexpzCppOverrides.frexpu  s   aS%s$'77
WJWWU*UUU~88<<&&U[[&988<<&&QWW&5(1-.xjqcXJbIJ	%h'"%j("; 	1IwHHLLY0	1!!rV   c                     d|  d| dS )Nzstd::hypot(r   r   rU   r  s     rW   hypotzCppOverrides.hypot  r  rV   c                     d|  dS )Nzstd::log10(r   rU   r  s    rW   log10zCppOverrides.log10  r  rV   c                     d|  dS )Nz
std::log2(r   rU   r  s    rW   log2zCppOverrides.log2  r  rV   c                     d|  d| dS )Nzstd::nextafter(r   r   rU   r  s     rW   	nextafterzCppOverrides.nextafter  s     2aS**rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rW   reluzCppOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rV   c                     d|  d| dS )Nzmin_propagate_nan(r   r   rU   r~  s     rW   minimumzCppOverrides.minimum      #A3b1--rV   c                     d|  d| dS )Nzmax_propagate_nan(r   r   rU   r~  s     rW   maximumzCppOverrides.maximum  r)  rV   c                     |  d| d| S )N ?  : rU   )r  r  cs      rW   wherezCppOverrides.where  s    Cs#aS!!rV   c                     d|  d| dS )Nzmod(r   r   rU   r~  s     rW   r  zCppOverrides.mod  s    aS1#QrV   c                 (    t        | t        |         S r  )rO   rH   )valr   s     rW   constantzCppOverrides.constant  s    Ce!455rV   c                    t        t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |t        |             }t        j                  ||      S )Nbounds)
rD   r4   rZ  rename_indexingr  r  r  r&   r2   r  )r  r   idx_strr   s       rW   
index_exprzCppOverrides.index_expr  sb    0067hhll##HHg.CD.I $ 
 ||C''rV   c                 "   t               }t        j                  j                  j	                         }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d| d      }|  d| d| S # 1 sw Y   exY w# 1 sw Y   ixY w)	Nr    = [&]return r   r{  z())r-  z() : )r7   r4   rZ  r  r	  r   swap_buffersr   r  r  rO   )maskbodyotherr   body_varr   
other_codes          rW   maskedzCppOverrides.masked  s    ~ 88<<&&(xj/0XX""4( 	0$++- 	0VFNNWVHA./	0 	0 	s	% "%9XJc)BC
s8*E*66	0 	0 	0 	0s$   'D8C9D9D	>DDc                     |  d| S )N && rU   r~  s     rW   logical_andzCppOverrides.logical_and      D}rV   c                     d|  S )N!rU   r  s    rW   logical_notzCppOverrides.logical_not      1#wrV   c                     |  d| S )Nr   rU   r~  s     rW   
logical_orzCppOverrides.logical_or  rH  rV   c                     |  d| S )N != rU   r~  s     rW   logical_xorzCppOverrides.logical_xor  rH  rV   c                     d|  d|  d| dS )Nr{  r|   & r   rU   r~  s     rW   bitwise_andzCppOverrides.bitwise_and  r  rV   c                     d|  d|  dS )Nr{  z)(~r   rU   rK  s    rW   bitwise_notzCppOverrides.bitwise_not  r  rV   c                     d|  d|  d| dS )Nr{  r|   | r   rU   r~  s     rW   
bitwise_orzCppOverrides.bitwise_or  r  rV   c                     d|  d|  d| dS )Nr{  r|  r   r   rU   r~  s     rW   bitwise_xorzCppOverrides.bitwise_xor  r  rV   c                    t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d       |j                  d| d| d| d       |j                         5  |j                  d	|  d
       d d d        |j                  d	|  d| d|  d| d	       d d d        |j                  d       |S # 1 sw Y   BxY w# 1 sw Y   (xY w)N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r7   r   r   rH   r   r  r  r   scalar_ts       rW   bitwise_left_shiftzCppOverrides.bitwise_left_shift  s   ~w[[] 	#AGG,HNN%aS(=hZ}U NN6xjA3kRSQTTcd  <!1!E:;<NN"1#%H
RUVWUXX]^_]``bc	 	t< <	 	s$   AC&C&C&C#	C&&C/c           
         t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d| d       |j                  d| d| d| d	       |j                         5  |j                  d
|  d|  d       d d d        |j                  d
|  d|  d| d       d d d        |j                  d       |S # 1 sw Y   ?xY w# 1 sw Y   (xY w)Nr^  r_  r`  z ) * CHAR_BIT - std::is_signed_v<z>;ra  rb  rc  rd  re  r|  z >> max_shift); >> r   rf  rg  rh  s       rW   bitwise_right_shiftz CppOverrides.bitwise_right_shift  s   ~w[[] 
	A#AGG,HNN%aS(=hZGghpgqqst NN6xjA3kRSQTTcd  K!1!BqcIJKNN-aS1#T!B?@
	A 	t	K K
	A 
	As$   AC)C#C)C&	"C))C2seedr;  c                     d|  d| dS )Nznormalized_rand_cpu(r   r   rU   rn  r;  s     rW   randzCppOverrides.rand  s    %dV2fXQ77rV   c                     d|  d| dS )Nz
randn_cpu(r   r   rU   rp  s     rW   randnzCppOverrides.randn  s    D6F81--rV   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rU   )rn  r;  lowhighs       rW   	randint64zCppOverrides.randint64!  s#    vRxr#ba@@rV   c                     d|  d|  d|  dS )Nr{  z)(1) / (decltype(z)(1) + std::exp(-r   rU   r  s    rW   sigmoidzCppOverrides.sigmoid%  s    1#.qc1B1#RHHrV   c           
      N   t               }d|  d}d|  d}|j                  d       |j                         5  |j                  d|  d| d| d       |j                  d	|  d
| d| d       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr{  )(0)r  r^  auto left = z > 0 ? r.  r   auto right = z < 0 ? return left - right;rf  r7   r   r   )r  r   scalar_zero
scalar_ones       rW   signzCppOverrides.sign)  s    ~!!D) 4(
w[[] 	3NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12	3 	t	3 	3s   ABB$NT)Nr  r  r  r  staticmethodr  r   rL  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r(  r+  r0  r  r4  r:  rD  rG  rL  rO  rR  rU  rW  rZ  r\  rj  rm  r   Exprrq  rs  rw  ry  r  rU   rV   rW   ry  ry    s   %+ + + + + + ( (T < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + +  &  " 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I 
 
rV   ry  r  c                       e Zd ZdZ fdZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAed?        ZBed@        ZCedA        ZDedB        ZEedC        ZFedD        ZGedE        ZHedF        ZIedG        ZJedH        ZKedI        ZLedJ        ZMedRdK       ZNedL        ZOedM        ZPedN        ZQedO        ZReSdP        ZTeSdQ        ZU xZVS )SCppVecOverridesz.Map element-wise ops to aten vectorization C++c                     t         |   |       fd}t        t              j	                         D ]<  \  }}t        |dd       t        k(  s|dvs t        | ||j                               > S )Nc                       fd}|S )Nc                     | D cg c]@  }t        |t        t        j                  f      st        |t              r|j
                  s|B }}| D cg c]   }t        |t              r|j
                  r|" }}t        |       }|r|rg }| D ]  }t        |t        t        j                  f      rt        |t        j                        r1|j                  s%t        j                  |t        j                        }n$t        j                  |t        j                        }t        |t              r|j                  n|}|j                  |        |r>t!        |      dk(  rt#        |      }n$
t$        j&                  k(  rt#        |dd        |dd  |r|rt        t(        j*                  t,              sJ |D cg c]p  }t        |t              r\|j
                  sP
t$        j.                  t$        j0                  t$        j2                  fvrt(        j*                  j5                  |      n|r }}|r 
|i |S t7        t$              }t9        |
j:                        }|J  || i |S c c}w c c}w c c}w )Nr   r5   )r   rm   r   r  rG   r   r#  	is_numberr2   r:  r   int64r4  r3   valueappendr   rL   r  r0  r4   rZ  CppVecKernelrq  rs  rw  	broadcastr,  r>  r  )argskwargsargscalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr1  funcr.  s            rW   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperM  s8     $!#UZZ'89"37

    $!#~63::  
  :w!H# -%cC+<=)#uzz:3==&)nnS%++&F&)ll3&D/9#x/H#))cC ,-  8})#/#9!6!66'3HQRL'A w%ahh===  (0  $ !+7N C(/$($3$8$8$3$9$9$3$=$=("%"	 HH..w7 ")) H  $ 4V44 "'!=J")*dmm"DK&222&777@ s   AI)%I.6A5I3rU   )r  r  r1  r.  s   ` rW   wrapz%CppVecOverrides.__new__.<locals>.wrap@  s    @8D NrV   r1  )rD  r:  )	r,  __new__varsr  itemsr>  r  setattr__func__)r%  r  kargsr  r   methodr.  r1  s         @rW   r  zCppVecOverrides.__new__=  sx    ws#O	b !1779 	;LD&v{D1\Ad S G dD$9:	; rV   c                     |  d| S )Nr}  rU   r~  s     rW   r  zCppVecOverrides.add      Cs|rV   c                     |  d| S )Nr  rU   r~  s     rW   r   zCppVecOverrides.sub  r  rV   c                     |  d| S Nr   rU   r~  s     rW   rL  zCppVecOverrides.mul  r  rV   c                     |  d| S r  rU   r~  s     rW   truedivzCppVecOverrides.truediv  r  rV   c                     |  dS )Nz.abs()rU   r  s    rW   r  zCppVecOverrides.abs      F|rV   c                     |  dS )Nz.sin()rU   r  s    rW   r  zCppVecOverrides.sin  r  rV   c                     |  dS )Nz.cos()rU   r  s    rW   r  zCppVecOverrides.cos  r  rV   c                     |  dS )Nz.exp()rU   r  s    rW   r  zCppVecOverrides.exp  r  rV   c                     |  dS )Nz.exp2()rU   r  s    rW   r  zCppVecOverrides.exp2      G}rV   c                     d|  d}|  d| S )Nr{  r  z	.exp() - rU   )r  vec_ones     rW   r  zCppVecOverrides.expm1  s#     aS%IgY''rV   c                     |  dS )Nz.erf()rU   r  s    rW   r  zCppVecOverrides.erf  r  rV   c                     |  dS )Nz.erfc()rU   r  s    rW   r  zCppVecOverrides.erfc  r  rV   c                     |  dS )Nz	.erfinv()rU   r  s    rW   r  zCppVecOverrides.erfinv      IrV   c                     |  dS )Nz.sqrt()rU   r  s    rW   r  zCppVecOverrides.sqrt  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )N( == r   r   r4   rZ  r  rG   r   _get_mask_typer  s     rW   eqzCppVecOverrides.eq  c    !((L111!^,,,ww"""(())!''231QCtA3a@@rV   c                    t        t        j                  t              sJ t        | t              sJ | j
                  t        j                  k(  rO|j
                  t        j                  k(  sJ t        t        j                  j                  | |f      \  }}| d| S | j
                  J t        j                  j                  | j
                         d|  d| dS )NrQ  r  r   )r   r4   rZ  r  rG   r   r   rk   rN   r  r  )r  r  x_casty_casts       rW   nezCppVecOverrides.ne  s    !((L111!^,,,77ejj 77ejj(((1!((2B2BQFKNFFXT&**77&&&hh--agg67q4s!DDrV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  r   r   r  r  s     rW   ltzCppVecOverrides.lt  c    !((L111!^,,,ww"""(())!''231QCs1#Q??rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  z > r   r  r  s     rW   gtzCppVecOverrides.gt  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   <= r   r  r  s     rW   lezCppVecOverrides.le  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   >= r   r  r  s     rW   gezCppVecOverrides.ge  r  rV   c                     |  d| S NrT  rU   r  s     rW   and_zCppVecOverrides.and_  r  rV   c                     |  dS )Nz.rsqrt()rU   r  s    rW   r  zCppVecOverrides.rsqrt      H~rV   c                     |  d| dS )Nz.pow(r   rU   r~  s     rW   r  zCppVecOverrides.pow  s    E!ArV   c                     |  dS )Nz.log()rU   r  s    rW   r  zCppVecOverrides.log  r  rV   c                     |  dS )Nz.round()rU   r  s    rW   r  zCppVecOverrides.round  r  rV   c                     |  dS )Nz.floor()rU   r  s    rW   r  zCppVecOverrides.floor  r  rV   c                     |  dS )Nz.ceil()rU   r  s    rW   r  zCppVecOverrides.ceil  r  rV   c                     |  dS )Nz.trunc()rU   r  s    rW   r  zCppVecOverrides.trunc  r  rV   c                     |  d| dS )Nz.fmod(r   rU   r~  s     rW   r  zCppVecOverrides.fmod#  s    F1#QrV   c                     |  dS )Nz	.lgamma()rU   r  s    rW   r  zCppVecOverrides.lgamma'  r  rV   c                 .    t        | |      \  } }|  d| S r  rK   r~  s     rW   rG  zCppVecOverrides.logical_and+  "    ,Q21Cs|rV   c                     d|  S N~rU   rK  s    rW   rL  zCppVecOverrides.logical_not0  rM  rV   c                 .    t        | |      \  } }|  d| S NrY  r  r~  s     rW   rO  zCppVecOverrides.logical_or4  r  rV   c                 .    t        | |      \  } }|  d| S Nr   r  r~  s     rW   rR  zCppVecOverrides.logical_xor9  r  rV   c                 .    t        | |      \  } }|  d| S r  r  r~  s     rW   rU  zCppVecOverrides.bitwise_and>  r  rV   c                     d|  S r  rU   rK  s    rW   rW  zCppVecOverrides.bitwise_notC  rM  rV   c                 .    t        | |      \  } }|  d| S r  r  r~  s     rW   rZ  zCppVecOverrides.bitwise_orG  r  rV   c                 .    t        | |      \  } }|  d| S r  r  r~  s     rW   r\  zCppVecOverrides.bitwise_xorL  r  rV   c                     |  d| S )Nz << rU   r~  s     rW   rj  z"CppVecOverrides.bitwise_left_shiftQ  rH  rV   c                     |  d| S )Nrl  rU   r~  s     rW   rm  z#CppVecOverrides.bitwise_right_shiftU  rH  rV   c                     t        t        j                  t              sJ t        j                  j	                  | |       S r  )r   r4   rZ  r  load)r   r;  s     rW   	load_seedzCppVecOverrides.load_seedY  s.    !((L111((--f-./rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r4   rZ  r  r7   rF   rn  r;  r   rand_functions       rW   rq  zCppVecOverrides.rand^  s@    !((L111~7v=ST 	 FD-88rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nzresult[offset_idx] = randn_cpu(r  r  r  s       rW   rs  zCppVecOverrides.randng  s;    !((L111~9$?UVFD-88rV   c                     t        t        j                  t              sJ t	               }d|  d| d| d}t        |||t        j                        S )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r4   rZ  r  r7   rF   r   r  )rn  r;  ru  rv  r   r  s         rW   rw  zCppVecOverrides.randint64n  sS    !((L111~=dVCYZ]Y^^`ae`ffhiFD-EErV   c                 ~    | j                   |j                   k(  sJ d       |  dt        j                  | |       d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r~  s     rW   	remainderzCppVecOverrides.remainderu  sI    ww!''! 	
I	
! D11!Q78QC@@rV   c                     |  dS )Nz.tan()rU   rK  s    rW   r  zCppVecOverrides.tan|  r  rV   c                     |  dS )Nz.tanh()rU   rK  s    rW   r  zCppVecOverrides.tanh  r  rV   c                     |  dS )Nz.reciprocal()rU   rK  s    rW   
reciprocalzCppVecOverrides.reciprocal  s    M""rV   c                     |  dS )Nz.atan()rU   r  s    rW   r  zCppVecOverrides.atan  r  rV   c                     |  dS )Nz.acos()rU   r  s    rW   r  zCppVecOverrides.acos  r  rV   c                     |  dS )Nz.asin()rU   r  s    rW   r  zCppVecOverrides.asin  r  rV   c                     |  dS )Nz.cosh()rU   r  s    rW   r  zCppVecOverrides.cosh  r  rV   c                     |  dS )Nz.sinh()rU   r  s    rW   r  zCppVecOverrides.sinh  r  rV   c                     |  dS )Nz.log10()rU   r  s    rW   r  zCppVecOverrides.log10  r  rV   c                     |  dS )Nz.log2()rU   r  s    rW   r  zCppVecOverrides.log2  r  rV   c                     |  d| dS )Nz.nextafter(r   rU   r  s     rW   r  zCppVecOverrides.nextafter  s    Ks!$$rV   c                     |  d| dS )Nz
.copysign(r   rU   r~  s     rW   r  zCppVecOverrides.copysign  s    Jqc##rV   c                     |  d| dS )Nz.atan2(r   rU   r~  s     rW   r  zCppVecOverrides.atan2      GA3a  rV   c                     |  d| dS )Nz.hypot(r   rU   r~  s     rW   r  zCppVecOverrides.hypot  r  rV   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	Nr{  r  z)(0.5)z * ((r}  z)/(r  z)).log()rU   )r  r  vec_one_halfs      rW   r  zCppVecOverrides.atanh  sE     aS%"1#V,uWIS3wis1#XNNrV   c                     |  dS )Nz.asinh()rU   r  s    rW   r  zCppVecOverrides.asinh  r  rV   c                     |  dS )Nz.acosh()rU   r  s    rW   r  zCppVecOverrides.acosh  r  rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Nr  r  r  r   r  r  r  zat::vec::clamp_min(r!  r"  r#  r$  r  s     rW   r&  zCppVecOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rV   c                     d|  d|  d|  dS )Nr{  z)(1)/(decltype(z)(1) + z.neg().exp())rU   r  s    rW   ry  zCppVecOverrides.sigmoid  s    1#_QCwqcGGrV   c                     |  dS )Nz.neg()rU   r  s    rW   r  zCppVecOverrides.neg  r  rV   c                    t        | j                        r)| j                  |j                  k(  sJ d       d|  d| dS t        d | |fD              sJ d|  d}t        j                  j                  |j                        dk  r,| ddt        j                  j                  z  dz
   d	| d
| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d
| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   F   K   | ]  }t        |j                          y wr  )r   r   )r  items     rW   r  z+CppVecOverrides.floordiv.<locals>.<genexpr>  s     G'

3G   !r{  r5   ::blend<r  (1), r  r  r  rQ  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  rT  )r   r   r"  r4   rZ  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rW   r  zCppVecOverrides.floordiv  s1   !''"77agg% V% -QCr!A66GAGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrV   c                     t         j                  j                  |j                        dk  r2d| d}| ddt         j                  j                  z  dz
   d| d| d}|  d| S )Nr5   r{  r   r  r  r  r  )r4   rZ  r  r   r  )r  r  r  s      rW   r  zCppVecOverrides.truncdiv  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )NrT  at::vec::minimum(r   r   r   r   rk   rN   r4   rZ  r  r  r  a_castb_casts       rW   r(  zCppVecOverrides.minimum  l    77ejj 77ejj(((1!((2B2BQFKNFFXS))&qcA3a00rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )NrY  at::vec::maximum(r   r   r  r  s       rW   r+  zCppVecOverrides.maximum  r!  rV   c                     |  d|  S r  rU   rK  s    rW   squarezCppVecOverrides.square
  r  rV   c                    t        t        j                  t              sJ |j                  t
        j                  k(  rY|j                  t
        j                  k(  sJ t        t        j                  j                  | ||f      \  }}}d| d| d| d| d	S d| d| d| dt        j                  j                  | |j                         d	S )Nr{  
)::blendv(r   r   )
r   r4   rZ  r  r   r   rk   rN   r  _get_mask_cast)r  r  r/  blendv_ablendv_bblendv_cs         rW   r0  zCppVecOverrides.where  s    !((L11177ejj 77ejj(((+?  1a),(Hh xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQRQXQX6Y5ZZ[\\rV   c                 ~   t               }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|j                  d       |j                         5  |j                  d	| d
       |j                  d| d
       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr{  r{  r  r'  r   r   r   r^  r|  r   r}  r~  rf  r  )r  r   vec_zeror  blendv_lblendv_rs         rW   r  zCppVecOverrides.sign  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[] 	3NN\(156NN]8*A67NN12	3 	t	3 	3s   <B33B<c           
         |t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  f	v sJ t         d|        t        | t              sJ | j                  }t        j                  j!                  | ||      }t        j                  j"                  j%                  t        j                  j&                  |      }|j)                  d| |fd|i       |t*        v r5|t         j                  k(  r"t        j                  j-                  | |||       |S )Nz does not support r  r   )r   rk   float64rn   bfloat16float16uint8int8r
  r  r  r   rG   r   r4   rZ  r  r  r  r  r  r   r  )r  r   r   use_compute_dtypesr  r  s         rW   r  zCppVecOverrides.to_dtype)  s   JJMMKKNNMMKKJJKKKK

 

 
	2 Z)%1
	2 

 !^,,,GG	xx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>HH((IvuErV   c                 z    t         j                  j                  }|dk(  r|  d|  dS ||  dS t        d|      )Nr  r  r  z.log1p()r  r  r  s     rW   r  zCppVecOverrides.log1p?  sT    jj66*SQCt,,[S>! J3'R rV   c                 	   t        t        j                  t              sJ t	               }t        j                  j
                  j                         }t        j                  j                  |       5 }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        d d d        |j                  d       t        j                  j                  j                  |       j                  | d}fd}|j                  r|}	n ||      }	t        |t                  }
 ||
      }t        t"              sJ |       |j                  rt	               }|j                  d       t        j                  j                  |      5  |j                         5  |j                  d| d	       |j                         5  |j                  d| d       d d d        |j                  d
       |j                         5  t        j                  j
                  j%                  t        j                  j                  |	      }t        j                  j
                  j%                  t        j                  j                  |      }t        |t"              sJ |       t        |t"              sJ |       |_        |_        t        j                  j&                  }|j                  d|j)                  |||       d       d d d        d d d        d d d        |j                  d       t        j                  j
                  j%                  t        j                  j                  |      }n|j                  rKt        j                  j
                  j%                  t        j                  j                  |  d|	 d|       }nJt        j                  j
                  j%                  t        j                  j                  |  d| d|
       }|j+                  d| |||fi        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   yxY w# 1 sw Y   \xY w# 1 sw Y   axY w# 1 sw Y   fxY w)Nr   r<  r=  r   rf  c                     t         j                  k(  r$t        j                  j	                          d|  dS t        j                  j                         d|  dS )N::from(r   r  )r   rk   r4   rZ  r  _get_vec_type)r   r   s    rW   maskify_or_vecifyz1CppVecOverrides.masked.<locals>.maskify_or_vecify[  s]     EJJ& 88**,-WTF!< ..u56avQ?rV   [&]if (z.all_zero())elser-  r.  rD  )r   r4   rZ  r  r7   r  r	  rD  r   r>  r   r  r  r   r   rO   rH   rG   r  	overridesr0  r  )r?  r@  rA  r   r   new_maskr   	body_coder<  body_code_vecrC  other_code_vecbody_vec_varother_vec_varr@  r  r   s                   @rW   rD  zCppVecOverrides.maskedK  s   !((L111~hhll!!#XX__T" 	4hNNU3%v./&&t, 4dkkm 4234 4	4
 	s	%e2J		 ==%M-i8M!%e)<=
*:6(N3=X=3??>DNN5!&&t, dkkm hZ|<=[[] @NNW^,<A#>?@v&[[] #$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCQ\QC%m^DSmSD).L&*/M'**  NN!)//(L-"X!YYZ[# 2 NN4 XX\\**  F ]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	htUF(CRHI4 4 4 4	4 	4>@ @    s   35R(Q89Q+Q8RR9&R,R)R,DRR,R9+Q50Q88R	=RRRR,R)$R,,R6	1R99Sc                 X   t        t        j                  t              sJ t        j                  j	                  |       }t        j                  j
                  t        j                  j                     }t        j                  j                  ||      }|dk(  rt        j                  | |      S |t        j                  j                  j                  t        j                  j                  t        |      t        |             }t        j                   ||      }t        |t"              r|j$                  }t        j                  j'                  ||      }n:t        j                  j)                  d ||t        j                  j                        }|j+                  d| |fi        |S )Nr   r6  r:  )r   r4   rZ  r  r8  itervars
tiling_idx_try_get_const_stridery  r:  r  r  r  rD   r&   r2   r  r3   r  arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstriderP  r  r  s           rW   r:  zCppVecOverrides.index_expr  s3   !((L111((.XX&&qxx':':;
//zBQ;**477((,,''  %,7LT7R ( C LLe,E%*XX__UF3FXX;;eUAHH$4$4F 	lT5M2>rV   c           
      x   d|  dd|  df}t        d |D              rt        d |D              S t        | j                     }t        j
                  j                  rt        j
                  j                  nt        j
                  j                  }t               }t        j
                  j                  j                  t        j                        }t        j
                  j                  j                  | j                        }|j                  d| fi        |j                  d| fi        t        j
                  j                  | j                        }|d	k(  rd
| dnd| d| d}|j                  |d	k(  rd| dnd| d| d       |j                  | d| d       |j                  d       |j!                         5  |j                  d| dt        j
                  j                   d       |j                  |  dt#        |       d       |j                  dt        j
                  j                   d       |j                  d| dt        j
                  j                   d       |j                  dt#        |       d       |j!                         5  |j                  d       d d d        |j                  |d	k(  r| dt#        |       dn| d| d t#        |       d       |j                  | d!| d"t#        |       d       d d d        |j                  d#       t        j
                  j$                  j'                  |       ||f}	t)        ||	      D ]/  \  }
}t        j
                  j                  j+                  |
|       1 ||fS # 1 sw Y   xY w# 1 sw Y   xY w)$Nr  r  r  c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r  r   s     rW   r  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  r  c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r   s     rW   r  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  r  r  r  )r  r5   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r^  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r"  r   rH   r   r4   rZ  	tail_sizer  r7   r  r	  r   r
  r  _get_num_vectorsr   r   rE   r  r  r  r  )r  r  r   r:  r   r  r  n_vec
mantissa_tr  r  r  s               rW   r  zCppVecOverrides.frexp  s{   aS%s$'77
WJWWU*UUUagg&%&XX%7%7qxx!!QXX=S=S~88<<&&U[[&988<<&&QWW&5!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[] 	NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKL V NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+	0 	u	%h'"%j("; 	1IwHHLLY0	1!!# 	 	s&   
CN0&N$8A$N0$N-	)N00N9c                     fd}|S )Nc                     |rJ t         j                  }t        |t              sJ t	               }|j                  d       | d   j                  }|j                  |      }|j                  r|j                  n|j                  }g }t        |   }j                  dv }	|	rdn|}
j                  dk(  rt        | d      n|
}
|j                         5  t        |       D ]  \  }}t        |t              r}|j                  sJ |j                  |k(  sJ |j                  d| d|j                   d	| d
       |j                  | d| dt!        |       d       |j#                  d| d       |j#                  |        |j                  d|
 d|j                   d        | }|j                  dt!        |       d       |j                         5  |j                  d| d
       d d d        |	r|j                  rJ d}d| d| d}n#dt!        |       }|dk(  rd|
 d}n	d|
 d| d}|j                  d| d| d       d d d        |j                  d       |S # 1 sw Y   {xY w# 1 sw Y   (xY w) Nr^  r   )r  r  r  rk   r  rU  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r5   rR  z>::loaduz at::vec::VectorizedN<r=  r  rf  )r4   rZ  r   r  r7   r   r   rY  rX  r  rH   r  r   r   rG   r   rE   r  )r  r  rZ  r   	vec_dtyperZ  r:  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rW   rY  z)CppVecOverrides._scalarize.<locals>.inner  s   :XXFfl333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
  B#,T? 0KFC!#~6"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/0 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[] >NN%5cU!#<=>%//// 3I 1&5'IG"5k$6G5H IIz$8"I$:6("UG8"T	9+R@A?B@ NN4 K> >'B Bs&   DI6I*0AI6*I3	/I66I?rU   )r%  r  rY  s    ` rW   
_scalarizezCppVecOverrides._scalarize  s    7	r rV   c                    t        t              }t        t              j                         D ]S  \  }}t	        |t
              s||vs| j                  |j                        }||_        t        | |t        |             U y r  )
r  r  ry  r  r   r  rj  r  r  r  )r%  vec_varsr   r  r  s        rW   _initialize_scalarizez%CppVecOverrides._initialize_scalarize%  sh    ( .446 	7LD&&,/D4H~~foo6 $T<#56		7rV   r  )Wr  r  r  r  r  r  r  r   rL  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rG  rL  rO  rR  rU  rW  rZ  r\  rj  rm  r  rq  rs  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  ry  r  r  r  r(  r+  r%  r0  r  r  r  rD  r:  r  r_  rj  rm  r`  ra  s   @rW   r  r  :  sV   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A     # #               % % $ $ ! ! ! ! O O        H H   S S$   1 1 1 1   	] 	]    * 	 	 J JX  , 6" 6"p : :x 7 7rV   r  cppvecc                       e Zd Zed        Zy)CppTile2DOverridesc                     t        t        j                  t              sJ t        j                  j	                  |       } t
        j                  | |      S r  )r   r4   rZ  CppTile2DKerneltransform_indexingr  r:  )r  r   s     rW   r:  zCppTile2DOverrides.index_expr4  s=    !((O444xx**40))$66rV   N)r  r  r  r  r:  rU   rV   rW   rp  rp  3  s    7 7rV   rp  c                       e Zd ZeZeZdZdZ fdZ	e
efdZd Zd.dee   fdZej$                  d        Z	 d/d	ej*                  fd
Zd	ej*                  defdZd	ej*                  dej2                  fdZd	ej*                  dej2                  fdZd Zdej*                  dej*                  dedefdZded	ej*                  fdZd.dZ de!e"ef   dedede#jH                  fdZ%d.dee&   fdZ'd Z(d  Z)d! Z*d" Z+d# Z,d$ Z-e.defd%       Z/d& Z0ej$                  d'        Z1d( Z2d) Z3d* Z4	 	 d0ded+ee   d,eej2                     fd-Z5 xZ6S )1	CppKernelr   r   c                 h   t         |   |       i | _        g | _        d | _        g | _        g | _        d | _        t               | _	        g | _
        t               | _        t               | _        t               | _        t               | _        t               | _        d| _        t               | _        t%        | j&                  | j(                  d      | _        t%        | j&                  | j(                  d      | _        t               | _        t               | _        || _        i | _        g | _        y )NFtmp_acc)name_prefixwrecps)r,  r-  active_rangesinner_itervarscall_rangesrM  rH  reduction_depthr=   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr8   newvar_prefixsuffixreduction_cseweight_recps_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r.  r  r  r1  s      rW   r-  zCppKernel.__init__A  s
    HJ 35=A(*,.# . 0 <>( . 0)7)9&)7)9&$2$4!&4&6#!-;-=* !3!3T[[iX #!
 '((*&=?.0 rV   c                 8   t         j                  j                  r'| j                  s| j                  j	                  d       | d}t         j                  j                  rdn	t               }| d}	| j                  j	                  | d| d |||       d       | j                  j                  t        ||||||             | j                  j	                  |	 d| d       | j                  j                  d| d	d
d| d ||||	|       ddg       y )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr  r   r+   r  r  r   r  r  r   )
r.  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr  acc_local_in_arrays
             rW   _gen_parallel_reduction_buffersz)CppKernel._gen_parallel_reduction_buffersf  s=    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 !$uI.!!++j)C(9.%(P'QQRS	
 	&&--"!		
 	##--1C0DC	{RS.TU&&11*;-x@se33NCI[glmnnop		
rV   c                 Z    | j                   D ]  }t        | j                  || d        y )Nr  )r  r   stores)r.  var_names     rW   %update_stores_with_parallel_reductionz/CppKernel.update_stores_with_parallel_reduction  s0    00 	IHT[[(xj4GH	IrV   r   c                    |J t               }t        j                         5 }t        | d      rK|j	                  | j
                         | j                  |       |j                  |j                                |j	                  | j                         |j	                  | j                         |j	                  | j                         d d d        t        | d      r|j	                  | j                         | j                  r5| j                  D ]&  }| j                  |   \  }}t        ||| d||      }( |S # 1 sw Y   sxY w)Ncodegen_inner_loops_tail)r7   r   r   r   r  r  r  r   r   loadsr  r  r  r{  rz  r   )r.  r   r   rP  startends         rW   gen_bodyzCppKernel.gen_body  s   ||~!!# 	%ut23DMM*((.##DKKM2KK

#KK%KK$	% 4./KK(** X!//4
s1$uE]ESVWX 	% 	%s   B)D>>Ec              #     K   | j                   }|rYt        j                  ||      }t        |t              r3|j
                  }t        |t              sJ t        j                  |_	        || _         	 | || _         y# || _         w xY ww)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr2   r  r   r3   r  rG   r   rk   r   )r.  r?  priors      rW   rD  zCppKernel.masked  ss      88D%(D$)zz!$777 #ZZ
	$J#DOeDOs   A/B
2A> 6B
>	BB
r   c                 P    | j                   |   }|||z  |z   i}t        ||      }|S r  )rH  r0   )r.  r   scaleitervar_idxr;  r   r   r   s           rW   scale_index_with_offsetz!CppKernel.scale_index_with_offset  s7     mmK(C%K&01uk2	rV   r   c                 6    t        | j                  |            S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rD   r8  r.  r   s     rW   index_to_strzCppKernel.index_to_str  s    
 T))%011rV   itervarc                 D     t         fd|j                  D              S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c              3   (  K   | ]  }|j                   j                  j                  v ret        j                  j                  |j                      t              r4j                  j                  |j                      j                          y wr  )r   r  varname_mapr   rG   
depends_on)r  sr  r.  s     rW   r  z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>  sj      
vv---488//7H HH  (33G<
s   BB)rf   free_symbolsr.  r   r  s   ` `rW   index_indirect_depends_onz#CppKernel.index_indirect_depends_on  s%      
''
 
 	
rV   c                 F    ||j                   v xs | j                  ||      S r  )r  r  r  s      rW   index_depends_onzCppKernel.index_depends_on  s,    %,,, 
0N0N71
 	
rV   c                 T    t        t        | j                  | j                              S r  )dictr  rH  rM  r3  s    rW   
var_rangeszCppKernel.var_ranges  s    Ct{{344rV   r  r:  lowerupperc                    |s|sy t        |t        j                        }|rIt        j                  |t
        j                        j                  }t        j                  j                  }nt        j                  j                  }	 | j                  t        j                  _
        t        j                  |t
        j                        j                  }|t        j                  _
        | j                  }|r.t        j                  j                  | j                  |            nd }	| j                  ||rdnd |	| j                        }
| j                   j#                  ||
d       y # |t        j                  _
        w xY w)N0F)
assignment)r   r   TMPr2   r:  r   r  r  r4   rZ  r  r  sexprr8  indirect_assertr  r  r  )r.  r  r:  r  r  indirectr  r   prior_computesize_strr   s              rW   check_boundszCppKernel.check_bounds  s
    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHdoo
 	&$59 $1 s   AE' 'E>r   c                     | j                   j                  |      }| j                  |      }| dt        |       d}| j                  j                  | j                  |      }|j                  d| ||fi        |S )N[]r  )r  inputr8  rE   r  r  r  r  )r.  r   r   r   r   r  s         rW   r  zCppKernel.load  st    iiood#$$U+aE*+1-""4::t4ftT5&92>rV   c                    d|v sJ | j                   j                  |      }| j                  |      }|| dt        |       d| d}n|dk(  rt        j
                  j                  s$| j                  dk(  r| dt        |       d| d}nSt        j                  j                  |      }dt        |    d	| d
}d| dt        |       d| d}nt        d|       | j                  j                  t        ||             y )Nbufr  ] = r   
atomic_addr5   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputr8  rE   r   r  r  r  r4   graph	get_dtyperH   NotImplementedErrorr  r   r;   )r.  r   r   r  moder   r   r   s           rW   storezCppKernel.store  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rV   r   r   rtyper   c                 @    ddt         t           ffd}|S )Nr:  c                 L    |  d d        dS t        |       S )Nr   r   r   )r   )r:  r   r   r   r   r  s    rW   rY  z.CppKernel._gen_reduction_prefix.<locals>.inner$  sH    |"1SEWUE-B,C1EE- rV   r  )r   rm   )r.  r   r   r  r   r   rY  s    ````` rW   _gen_reduction_prefixzCppKernel._gen_reduction_prefix  s    	 	 	 rV   c                 h    | j                   D ]#  }| j                  j                   ||             % y r  )r  r~  r  )r.  r:  gen_fns      rW   finalize_reduction_prefixz#CppKernel.finalize_reduction_prefix3  s/    66 	7F!!((6	7rV   c           
      ^   |dv }|||f}|| j                   j                  v r| j                   j                  |   S | j                   j                  | j                  d| d      }| j                  j                  |        d| _        |r|n|}t        ||      }	| j                  j                  | j                  ||	||t                     | j                  J | j                  | j                     }
t        | j                  dz   t        | j                              D ]$  }|
| j                  |   z  | j                  |   z   }
& | j                   j#                  | dt%        ||||
       d       | j'                  ||	||       t)        ||      }|| j                   j                  |<   |S )	Nra   r`   
reduction FwriteTr5   r   r   )r  reduction_cacher  r  r  r  r  r   r  r  r   r}  rH  rG  r   rM  r  r   r   r  r   )r.  r   r   r   r  argmax_or_argminreduction_keyr   
init_dtyper   r   r   r   s                rW   	reductionzCppKernel.reduction7  s   )-AA!>58D..>>>%%55mDD  ))JJ*]O4E * 
 	  ''3%1 "2Y
%njA((//&&X~z>	

 ##///d223t++a/T]]1CD 	>ADKKN*T]]1-==E	>e3(eUKLAN	
 	,,S(NJW">37<B**=9rV   c                     | j                  |      }| j                  j                  |      }| j                  j	                  t        || dt        |       d| d             y )Nr  r  r   )r8  r  r  r  r   r;   rE   )r.  r   r   r  r   s        rW   store_reductionzCppKernel.store_reductionV  s]    $$U+iit$''#aE(:';4waHI	
rV   c                    | j                   ri| j                   t        |      t        |      z   k(  s+J | j                    dt        |       dt        |              | j                  t        |      k(  sJ t        |      t        |      z   | _         | j                   D cg c]  }| j	                  |       c}| _        t        t        | j
                              D cg c]  }t        t        j                  |       c}| _
        t        |      | _        | j                  d | j                   | j                  | j                  d  fS c c}w c c}w )Nr  r}  )r|  r   r}  r   r8  rM  rG  r.   r   XBLOCKrH  )r.  lengthsreduction_lengthsr  ns        rW   
set_rangeszCppKernel.set_ranges]  s2   ##uW~>O8P'PP ##$Dw(8EBS<T;UVP ''3w<777$W~6G0HHD<@<L<LMq4//2MDK s4;;/0 /t{{A>DM $'w<D MM0D001MM$..01
 	
 Ns    E !Ec                     | j                   J t        j                  j                  j	                  t        | j                         d      S )N    fallback)r|  r4   r  sizevars	size_hintr/   r3  s    rW   r  zCppKernel.size_hintp  sF    +++ww))$**+d * 
 	
rV   c                 ^   t        | t              sJ t               | j                  J t        |j                  t
              r+|j                  j                  |j                               n | j                  |j                               |j                  d uxr# |j                  j                     j                  t        j                         5 }j                  r6rj                          nj                         |j!                         n4dkD  r/j#                         r|j%                  j'                                dt(        ffddd	 ddt(        dt*        ffdddt(        dt*        ffd	 	 ddt(        dt*        dt,        ffd	|j%                  j'                                t        |j                  t
              r t        t.        j0                  t2              rt.        j0                  j4                  rt.        j0                  j4                  }|j7                         D ]  }t9        |j;                         j<                  D cg c]  }| j?                  |       c}      }t@        |j;                         jB                     }	d
|	 dtE        |       d}
|jG                         }jI                  d|	 d| d|
 d       jI                  |	 d| d| d         |       d d d        y c c}w # 1 sw Y   y xY w)Nr5   
_loop_nestc                      fd} j                         }t        |t              r|j                  D ]
             y t        |t              sJ  j
                   |       r|j                          t        j                         5 }|j                  j                                |j                         d d d        y # 1 sw Y   y xY w)Nc                      j                   sJ j                   j                     } | j                  xr | j                  S r  )rB  r  r  parallel)rootr  	par_depths    rW   is_parallel_reductionzOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction  s=    %++++%++I,A,ABD,,>>rV   )
get_kernelr   rW  rY  CppKernelProxyrB  r  r   r   r   r   r  )r  r  rZ  r   r   gen_loop_nestr  s   `   rW   
gen_kernelz0CppKernel.codegen_loops_impl.<locals>.gen_kernel  s    ?
 $..0f&:;&,ll 2
%j12 &fn===!''38M8ODDF#--/ .5++DKKM:-. . .s   1CCc                     |r| j                   }|r| j                  |z   }|S | j                  }|r|| j                  z   }|S || j                  z   }|S r  )r  r  r~  r  r  )rZ  r  	is_suffixr  prefixs        rW   get_reduction_prefix_suffixzACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffix  sd    #44F!'!A!AF!J!M#44F!'&*J*J!J "M "(&*N*N!N!MrV   depthc                    | j                         }| j                  sJ | j                  |   }t        j                         5 }|j                  rI|sG 	||j
                  d      }|r|j                  j                                j                  |       
rR|j
                  rFj                         |j                  r)|j                  sJ j                  |j                          | |       
rC|j
                  r7|j                  rj                  |j                         j                          |j                  r&|s$j                   	||j
                  d             d d d        y # 1 sw Y   y xY w)NF)r  T)r  rB  r   r   r  r  r   r   r  r  r  close)r  r  in_reductionrZ  loopstack_outerr~  r   gen_loop_atr  is_reduction_loopthreadsworksharings          rW   gen_loop_with_reductionz=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction  sA    $..0!''''!''.))+ {((+F"DMMU,( ,'55dkkmD$45(T]]#,,W5!66#)#@#@@#@ KK(C(CD
E2(T]]!88 KK(E(EF#))+((7 &+  s   D*E66E?c                 X   t        j                         5 }| j                  sJ | j                  |   }|j                         }|
	 d d d        y j	                  |       |j                  j                                 | |dz   |j                         d d d        y # 1 sw Y   y xY wr   )r   r   rB  linesr   r   r   r  )r  r  r   r  
loop_linesr   r  s        rW   r  z1CppKernel.codegen_loops_impl.<locals>.gen_loop_at  s    ))+ Lu%++++%++E2D!%J!)L L OOJ/''6!*eai9J9JKL L Ls   1B AB  B)r
  c                 t    | j                   |t        | j                         k(  r	 |        y  | ||       y r  )rB  r   )r  r  r
  r  r  s      rW   r  z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest  s7    
 ##+uJ<L<L8M/Mz*+J|LrV   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )%r   r   r+   r|  rZ  rW  decide_parallel_depthmax_parallel_depthrB  r  r  r   r   r
  r	  r  mark_parallelsingler   r   rF  rm   rk   r4   local_buffer_contextrJ   local_buffersvaluesr/   
get_layoutr:  r8  rH   r   rD   get_namer  )r.  rH  r   r  r   r  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namer  r  r  r  r  r  r  r  s     ``        @@@@@@@@rW   codegen_loops_implzCppKernel.codegen_loops_implv  s   $///&(+++i&&(<=!((>>,,.I 22,,.I
 OO4' D	 5 56CC 	 !!# |	%u''$%%'((1''	21%%'''6.x .$" DI$-0 B	L 	L 	L %*M$MM #M . 9++-ABq557IJ**88 !" 6 6 D D$1$8$8$: L%2 -9,C,C,E,J,J ( !00:&N '3<3J3J3L3R3R&SO!2?2C5~I^H__`aH(4(=(=(?%KK*?*;9EVDWWZ[cZddef KK*+2.?-@HYGZZab" )$y|	% |	%\]|	% |	%s    !FL#4LB	L#L##L,c                 T    t         j                  |       }| j                  |||       y r  )rF  buildr%  )r.  r   r  rH  s       rW   codegen_loopszCppKernel.codegen_loops	  s"    NN4(		4=rV   c                 :    t         j                  j                  ryy)NAOTI_TORCH_CHECKTORCH_CHECK)r4   r  aot_moder3  s    rW   assert_functionzCppKernel.assert_function		  s    77% rV   c                    | j                   J | j                   |j                  |j                  |j                  z    }| j                         }d}d}|D ]m  }t        j
                  j                  j                  |d      }|d|z  k\  s||k(  r n3||z  t        j                  j                  k  r n|dz  }||z  }||z  }o t        j                  j                  r|dk(  rt        |      dkD  rd}t        ||j                        S )Nr5   r   r  r  r   r
  r  )r|  r  r
  r  r4   r  r  r   r  min_chunk_sizer  r   r	  )	r.  r  r  rM  seqparr  r  hints	            rW   r  zCppKernel.decide_parallel_depth	  s   +++!!**"..1C1R1RR

 nn 		D77##--dT-BDa'k!SG^g~

 9 99QJE4KC4KC		 ::%%%1*VqE .@.L.L
 	
rV   c              #     K   | j                   | j                  | j                  | j                  f}t	               | _         t	               | _        t	               | _        | j                  j                         | _        d  | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         |\  | _         | _        | _        | _        y wr  )r  r  r  r  r=   cloner  r  )r.  r  s     rW   write_to_suffixzCppKernel.write_to_suffix-	  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     t        |i |S r  )rG   )r.  r  r  s      rW   create_cse_varzCppKernel.create_cse_var:	  s    t.v..rV   c                 "    dt         |    d| dS )Nzc10::convert<r  r   )rH   )r.  srcr   r   s       rW   r  zCppKernel.get_to_dtype_expr=	  s    |E232cU!<<rV   c                 b    | j                  |||      }| j                  j                  ||       y r  )r  r  r  )r.  dst	dst_dtyper:  r   r  s         rW   r  zCppKernel.cache_dtype_convert@	  s(    %%c9i@T3rV   r  r   c                 V   
 |d} j                   syg 

 fd}|-| j                   v sJ  j                   |   \  }} ||||      s4y j                   j                         D ]  \  }}|\  }} ||||      r y dj                  
      }	|	r|j                  d| d|	 d       yy)	NrR   Tc                 8   | |k(  ryd }t        j                        D ]  \  }}||k(  s|} n t              t        k(  r|r| dk(  r|j                  |   k(  rd}j                  | dt        |               j                  | dt        |              y)NFr   r5   r  r   T)r   rH  r  ru  rM  r  rE   )r  r  r   var_idr   _var
conditionsr.  s         rW   genz)CppKernel.codegen_conditions.<locals>.genP	  s    |F$T]]3 4$;F
 T
i'QJ4;;v..T+e*<)=>?SS)9(:;<rV   FrF  zif(r  r   )rz  r  joinr   )r.  r   r  r   rC  r  r  rA  _rangejoined_conditionsrB  s   `         @rW   codegen_conditionszCppKernel.codegen_conditionsD	  s     >F!!
	& ?$,,,,,++C0JE3uc3' $ 2 2 8 8 : !f#
s5#t, ! #KK
3NNS*;)<B?@rV   r  )r5   r   NN)7r  r  r  ry  r@  rD   r  r  r  r-  r   r   r  r  r   r7   r  r   contextmanagerrD  r   r  r  ro   r  r   r  r  r  rk   r  r  r  r	   r9   r   r   r  rm   r  r  r  r  r  r%  r(  propertyr-  r  r6  r8  r  r  rG  r`  ra  s   @rW   ru  ru  ;  s    IEMF#1V /('
RIXl3 ( $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$;#$  	
 {{:7hsm 7>

&
M%^> ! ! !
: 
B 
B/=  !%&*	.. . ell#	.rV   ru  c                   J    e Zd ZeZ	 d) fd	Zdej                  dej                  fdZ	de
j                  defdZde
j                  defd	Zde
j                  defd
Ze
j                  fde
j                  defdZdede
j                  defdZ	 d)dedej                  de
j                  dee   fdZ	 	 	 d*dee   dej                  de
j                  dee   deeeef      dedee   fdZdedej                  f fdZ	 d+deeef   dedej                  de
j                  def
dZd)dZd Zd ZdedefdZ dedej                  defd Z!d! Z"d" Z#d)d#Z$ddde
jJ                  fdeej                     d$ee   d%ee
j                     fd&Z&d) fd'	Z' fd(Z( xZ)S ),r  Nc                     t         |   ||       t        j                         | _        | j                  sJ |dkD  sJ d       || _        || _        || _        |r|| _        y || _        y )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r,  r-  r   pick_vec_isavec_isar  rI  rX  	num_elems)r.  r  r  r  rI  rX  r1  s         rW   r-  zCppVecKernel.__init__x	  si     	{+"//1|||q T"TT *$"&/]rV   r   r  c                       j                  ||      ry  fd|j                  D        D ]"  }t        |t              sJ |j                  s" y  t        || j                        }|j                  r|S d S )Nc              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  r   r   r  r  r  r   r  r  r.  s     rW   r  z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>	  s:      
a* HH  (
   AA
)r  r  r   rG   r   r  r  r  )r.  r   r  indirect_varrN  s   `    rW   rJ  z"CppVecKernel._try_get_const_stride	  s|    ))%9
''
 	L
 lN;;;""	 %UGT5G5GH))v3t3rV   r   r   c                     t        j                  | j                  |j                  z  dz  | j                  j                         z        }|dk\  sJ |S )N   r5   )mathr  r  itemsizerO  	bit_widthr.  r   num_vectorss      rW   rY  zCppVecKernel._get_num_vectors	  sO    ii/!3dll6L6L6NN
 arV   c                 p    | j                   |j                  z  dz  | j                  j                         z  S )NrX  )r  rZ  rO  r[  )r.  r   s     rW   r  z!CppVecKernel._get_raw_num_vectors	  s0     !!ENN2Q69O9O9QQQrV   c                 h    | j                  |      }|dk(  rdt        |    dS dt        |    d| dS )Nr5   rR  r   rS  ra  )rY  rH   r\  s      rW   r;  zCppVecKernel._get_vec_type	  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrV   c                 l    |t         j                  k(  ry| j                  |      }dt        |    d| dS )NrR   r`  ra  r   )r   rk   rY  rH   r\  s      rW   r  zCppVecKernel._get_mask_type	  s<    EJJ++E2"<#6"7qQGGrV   r?  c                     |j                   t        j                  k(  sJ t        |             | j	                  |      }| dt
        |    d| dS )Nz.template cast<ra  r   )r   r   rk   reprrY  rH   )r.  r?  r   r]  s       rW   r(  zCppVecKernel._get_mask_cast	  sP    zzUZZ'3d3'++E2|E':&;1[MMMrV   r   	load_maskc                    t         |   }| j                  |      }d}|rS|j                  s&| j                  t        j
                         d| d}n!| j                  |t        j
                         }|dk7  r| dt        |       n|}|t        j                  k(  r| j                          d| d}	|	S |r| d| d| d| dn,| j                  |       d	| d
t        | j                         d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        Nr:  r   r   r}  z.template loadu<ra  r  ::loadu(r   )rH   rY  r   r  r   rn   r(  rE   rk   r;  rP  )
r.  r   r   r   rc  cpp_typer]  load_mask_strloadbufr   s
             rW   _get_vec_load_linezCppVecKernel._get_vec_load_line	  s      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9A>D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rV   Fr   store_value
accu_storec                 Z    |r	|J d       |r|sJ  j                   dt        j                  dt        f fddt        j                  dt        f fddt        dt        f fd}t               }|j                  d	       |j                         5   |      }	 |      }
d
t        |    d|
 d}|j                  |       |r |j                  | dt        |	       d       t         j                   j                      d      }i } fd|j                  D        D ]4  }t        |t              sJ |j                  s" ||      }| d| d||<   6  j!                  | j                  |      }d} j"                  l|rJ d       t         j"                  t              sJ  j"                          j"                  j                  r j"                   d| d}n j"                   d}t%        j&                         r|j                  d j(                          n|j                  d j(                          |j                  d| d| dt         j*                         dz   | dz          |j                         5  t-        j.                         5 }t        |      }|D ]#  }t1        j2                  d| z   dz   ||   |      }% || d| dn| }|r4|j                  d | d       |j5                  |j                                |r!|rd!nd"}|j                  | d#| d$| d%       n|j                  d&| d'| d(       ddd       ddd       |s( j7                  d)d*|      }|j                  d+| d(       ddd       |j                  d,       |r#|j                  d(       j9                  |       y j:                  j=                  |      }t        |t              sJ d-|_        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w).a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S N   )rZ  rP  r   r.  s    rW   get_result_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size	  s1    ~~!~~enn)<==~~%rV   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S rn  )rZ  r  rp  s    rW   get_tiling_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size	  s5    ~~!))Q%..-@AA)))rV   vec_varc                 L   | j                   sJ t               }|j                  d       |j                         5  | j                  }|J |t
        j                  k(  rt
        j                  } |      } 	|      }|j                  dt        |    d| d       |  dt        |       d}|j                  |       |j                  d       d d d        |j                  d       
j                  j                  |      }t        |t              sJ |S # 1 sw Y   JxY w)	Nr=  rU  r   rV  rW  r   zreturn tmpbuf;rf  )r   r7   r   r   r   r   rk   rn   rH   rE   r  r  r   rG   )rt  r   rb  result_sizetiling_sizer   r  r   rq  rs  r.  s          rW   vec_to_arrayz@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array
  s   >>!>>DNN5! 1#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/01 NN4 XX&&vt4Ffn555M!1 1s   BDD#r=  rU  r   rV  rW  r   rD  c              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  rS  rT  s     rW   r  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>,
  s:      !!!TXX. $$QVV,!rU  r  r  r  r;  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   r>  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   r=  rf  T)r  r   r   rm   rG   r7   r   r   rH   rE   r-   rH  rI  r  r   r   r  r  r   is_gccr  rP  r   r   r   r   r   ri  r  r  r  )r.  r   r   r   r   rj  rk  rx  r   rv  rw  result_declareitervar_innerreplacementsrV  	array_varrc  r   index_crhsr   	load_liner  rq  rs  s   `   `                  @@rW   rL  z*CppVecKernel._load_or_store_non_contiguous	  s[   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[] ?	7)%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++! Q
 ",???&& ,\ :I4=;aa1PL.Q 004??= 1 E I*&I(II!$//>BSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!#!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
  H
 4 4 6 H%%e,$0 L ff<.1E9$\2G .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG!H H"  33OQN	156?	7@ 	tNN3MM$XX&&vt4Ffn555 FMM;H H H HY?	7 ?	7sE   B1P!EP!P(B2P	P"2P!	PPP	P!!P*r   c                 6   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| j                  | j                     }| j                  ||      }|dk(  rt        	| )  ||      S |dk(  rE| j                  |||| j                        }| j                  j                  | j                  |      }n| j!                  |||      }t#        |t$              sJ |j'                  d| ||fi        d|_        |S )Nr   r5   r  T)r  r  r8  r4   r  r  rH  rI  rJ  r,  r  ri  r  r  r  r  rL  r   rG   r  r   )
r.  r   r   r   r   rM  rN  r   r  r1  s
            rW   r  zCppVecKernel.loadh
  s    iiood#$$U+!!$']]4??3
++E:>Q;7<e,,q[**3udooNDXX&&tzz48F77UEJF&.111ftT5&92>rV   r  c           	         t        |t              s#t        |t              r|j                  sJ |       | j                  | j
                     }| dt        |       }| j                  ||      }t               }	|dk(  r|rl|t        j                  k(  r#| j                  | j                  |       d| dn,| j                  |       d| dt        | j                         d}
d| d|
 d}|t        j                  k(  r%| j                  |	j                  | d| d       |	S |	j                  | d| dt        | j                         d       |	S | j                  ||||	||	       |	S )
a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        r}  r5   re  r   r   r  .store(r   )r   rj  rk  )r   ro   rG   r   rH  rI  rE   rJ  r=   r   rn   rX  r;  rP  r   rL  )r.  r  r   r   r   rk  rM  var_exprrN  r   r  s              rW   _get_store_linezCppVecKernel._get_store_line|
  s   " %%un-%,,		 
 ]]4??3
U#k%012++E:>Q; +0F ))%01(1E ..u56hxj;W[WeWeKfJgghi 
 E7#dV1-#(>%z<=  gWXJbT^^1L0MRP  ..UE$Ej /  rV   c                    dv sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                        }| j                  |      }t        j                  j                        }|B| j                  ||||      }| j                  j                  |j                  fd             y |dk(  r.t        j                  j                   sT| j"                  dk(  rE| j                  | |||d      }| j                  j                  |j                  fd             y | j%                  |      }| j%                  t&        j(                        }	t*        |   }
t-        j.                  |t&        j(                        j0                  }t        |t              r|j                  sJ d|
 d	|	 d	| d
| d	| d	| d}| j                  j3                  t5        |             y t7        d|       )Nr  c                     t        |       S r  r;   r  r   s    rW   <lambda>z$CppVecKernel.store.<locals>.<lambda>
  s    ,tQ2G rV   r  r5   T)rk  c                     t        |       S r  r  r  s    rW   r  z$CppVecKernel.store.<locals>.<lambda>
  s    l46K rV   zatomic_add_vec<r   r  r   r  )r   rG   r   r  r  r  r8  r4   r  r  r  r  r  mapr   r  r  r  rY  r   r  rH   r2   r:  r  r   r;   r  )r.  r   r   r  r  r   r   r   n_srcn_idxr   r   s    `          rW   r  zCppVecKernel.store
  s   }}%07%70||NN5)Eiit$$$U+!!$'<''sE5ADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@!%8U\\II(5'E7"SEE7RTUZT[[]^%%l4&>?%D6&:;;rV   c           
         |t         v sJ |dv }| j                  | j                  k\  }|r|n|}t        |t              sJ |       |j
                  s| j                  |      }|||f}|| j                  j                  v r| j                  j                  |   S d}	|	 dt        |    d}
t        ||      }| j                  ||      }| j                  j                  | j                  d| d      }t        |t              sJ | d}d	| }| xj                  | ||gz  c_        d
| _        | j                   j#                  | j%                  ||||t&                     | j                   j#                  | j%                  ||||| j(                               t+        j,                  d | j.                  | j                  d        }|dk(  r| j                  J | j                   j#                  | j%                  ||||| j(                               t+        j,                  d | j.                  | j                  d        }| j                  | j                  k\  r| j0                  nd}t3        ||      | _        | j4                  | j6                  j                  vr| j6                  j                  | j8                  d| j4                   d      | _        | j:                  | j6                  j                  | j4                  <   | j<                  j?                  | jA                  |             tB        jD                  jF                  rdn	tI               }| jJ                  j?                  | jA                  ||             n(| j6                  j                  | j4                     | _        | jL                  r|n|}| jN                  j?                  | d| jQ                  |||d
       d       n| j                  J | jR                  | j                     }tU        | j                  dz   tW        | jR                              D ]$  }|| j.                  |   z  | jR                  |   z   }& ||||d}| jN                  j?                  | d | jP                  ||fi | d       | jY                  ||||| jP                  | j(                         | jY                  ||||tZ        t&               |dk(  r+| jY                  ||||| jP                  | j(                         |t\        j^                  k(  }|r1ta        |      rS| jc                  |      dv sJ d       d| d}d| d}| jd                  j?                  | dt[        |||       d       n|r	| d| d}n|r|dv rd| d}n|dk(  sJ | d}n}d| jQ                  |dd       z   d!z   }|t\        j^                  k(  }|rt\        jf                  n|}d"t        |    d}
d#t        |    d$| jc                  |       d}| d%|
 d&|
 d'| d$| d
}| jd                  j?                  | dt[        ||||(       d       |}n>|}ta        |      r1d	| }| jd                  j?                  | dt[        |||       d       ti        ||      }|| j                  j                  |<   |S ))Nr  zat::vecz::Vectorized<r   r  Fr  _vecmasked_Tc                     | |z  S r  rU   r  s     rW   r  z(CppVecKernel.reduction.<locals>.<lambda>
  s
    Q rV   rg   c                     | |z  S r  rU   r  s     rW   r  z(CppVecKernel.reduction.<locals>.<lambda>  s
    QU rV   r5   r  r   r   )r   r   horizontal_reductionr   )r  r  )r5   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rf   rc   r_   rJ  z.all_zero()r^   z.all_masked()z	{ return r  r  z; }rR  zat::vec::vec_reduce_all<r   z([](z& x, z& y) r  )5VECTORIZABLE_RTYPESrI  r}  r   rG   r   r  r  r  rH   r   reduction_acc_type_vecr  r  r  r  r  r  r  r   reduction_init_vecrI  rJ  rM  r  r   weight_recp_vec_ranger  r  weight_recps_valr  r   welford_weight_reciprocal_vecr   r  r  r+   r  rX  r  reduction_combine_vecrH  rG  r   r  r   r   rk   r*   rY  r  rn   r   ) r.  r   r   r   r  r  r  r  r  vec_nsvecr   acc_type_vecr   acc_vecmasked_acc_vecreduction_sizereduction_factorr  acc_vec_r   r   r  r   r   masked_next_valuereduce_all_bodyrb  vec_reduce_all_functmpvarmasked_tmpvarr   s                                    rW   r  zCppVecKernel.reduction
  s    !4444)-AA#$2F2FF"2Y
%07%70||NN5)E!>58D..>>>%%55mDDl5&9%:!<%njA22>:N  ))JJ*]O4E * 
 #~...E,"7),  uw$GG  ((//&&X~z>	

 	((//&&''	
 #))D,@,@,B C
 --''333,,33**" "++ '--"DKK0D0D0F$GN '+oo9M9M&M""ST  *2.BR)SD&))1F1F1V1VV(,(=(=(F(FLLJt/I/I.J"KSX )G )% )) %%55d6P6PQ 22<<66u= zz11 "-/ 
 ))3366ukJ )-(=(=(M(M..)% *.~WHKK!!*C : :>8UZ\` abbcd ''333MM$"6"67E4//!3S5GH BA.q1AAB $(<&	F KK!!)39t99.'\U[\]]^_ 	,,!%!;!;"55 	- 	
 	,,!2, 	- 	
 --00%)%?%?"&"9"9 1  5::%#N3,,U3 8  J JJ   7wiqA
&=n=MQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 34DU3%u_L]]_`g_hhij
!!++%s,^S*Xabccde FF#N3")& 2%%//hc"3NFM"Z![[\] #>6:<B**=9rV   c                 &   | j                  |      }| j                  j                        }t        j                  j                        }|j                  r%|t        j                  k(  r|nt        j                  nt        j                  }t        j                  j                  |      }t        j                  j                  |      }t               }	| j                  | j                  k\  r.|	j!                  | dt#        |       dt$        |    d| d       n||k7  rt$        |    d| }
|t        j&                  k(  r&| d| j                  t        j&                         d}n?||cxk(  rdk(  rn nd	t$        |    d| d
}n d	t$        |    d| dt$        |    d| d| d
}|	j!                  d|
 d| d       |
}|	j)                  | j+                  ||||             | j,                  j)                  |	j/                  fd             y )Nr  z] = static_cast<r  r   r   z.template cast<bool,r   r5   at::vec::convert<r   ra  r   r   r   c                     t        |       S r  r  r  s    rW   r  z.CppVecKernel.store_reduction.<locals>.<lambda>  s    T18M rV   )r8  r  r  r4   r  r  is_floating_pointr   rj   rn   r  rZ  rY  r=   rI  r}  r   rE   rH   rk   r  r  r  r  )r.  r   r   r  r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rW   r  zCppVecKernel.store_reduction  s   $$U+iit$GG%%d+	 ** $u||3Y 	
 ((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 E!%1)%<$=Qug"F

*!&';D<Q<QRWR\R\<];^^abG&/>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UC	JK$$TXX.M%NOrV   
scalar_varc                    |j                   rJ |j                  t        j                  k(  rE| j                  j                  | j                  | j                          d|j                   d      }n]|j                  J | j                  j                  | j                  | j                  |j                         d|j                   d      }t        |t              sJ |j                  |_        |j                  |_        d|_         |S )Nr:  r   r  T)r   r   r   rk   r  r  r  r  r   r;  r   rG   dependent_itervars)r.  r  rt  s      rW   r  zCppVecKernel.broadcast  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>222"((%/%B%B"rV   rN  c           	      "   |j                   rJ |j                  J | j                  j                  | j                  | j                  |j                         d| d| d      }t        |t              sJ |j                  |_        d|_         |S )Nz	::arange(r   r   T)r   r   r  r  r  r;  r   rG   )r.  r   rN  r  s       rW   rK  zCppVecKernel.arange  s    <<{{&&&""LL!!%++./yr&K
 &.111{{rV   c                    t         |   }| j                  |      }t        |      rd| dS |dv rWt        |   }| j	                  ||      }|dk(  rt        |      rd| dnd| d}nt        |      rd| dnd| d	}| d
| dS |dk(  r| j                          dS t        ||      }| d
| d}	|t        j                  k(  r|dv sJ | j                          d| dS |	S )Nr   r   ry   r`   r|   r{   r~   rz   r}   r  r   rf   z	::from(0))r^   r_   rc   r:  )
r<   r;  r*   rH   r  r   r  r   r   rk   )
r.  r   r   r   vec_typer   r   r3  scalar_initvec_inits
             rW   r  zCppVecKernel.reduction_init_vec  sA   07%%k2/hZs++11!+.F22>5IH) &e, +6(-@/xx@  &e, ,F8=A/xx@ 
 ZqQ''U"))+,I66$^U;ZqQ/EJJ!%::::))+,GK=BBrV   c                    t         |   }| j                  |      }t        |      rd| dS |dv rw| j                  |      }| j                  t        j
                        }|t        j                  k(  r!dt        t        j                      d| d| dS dt        |    d| d| dS |t        j                  k(  r|dv sJ | j                          S |S )Nr   r   ry   zIndexValueVec<r   )r^   r_   rf   rc   )
r<   r;  r*   rY  r   r  rk   rH   rn   r  )r.  r   r   r   r  r  r  s          rW   r  z#CppVecKernel.reduction_acc_type_vec  s    07%%k2/hZq))11))+6E))%++6E

"'U[[(A'B"UG2eWTUVV#L$=#>br%PQRREJJ!%AAAA))+,-rV   c                     |rt        | j                  |      n| j                  }t        |      }d| j                  |       d| j                   d| dS )Nzstatic WeightRecp<rT  r  r   )r   r  rE   r;  r  )r.  r   r  vec_num_range_threadvec_num_range_thread_exprs        rW   r  z*CppVecKernel.welford_weight_reciprocal_vec  sk      D..<++ 	
 %00D$E! !3!3E!: ;2d>S>S=T()	
rV   r  r   c                    |t         j                  k(  }|dk(  r=| j                  rd| d| dt        | j                         dS |r| d| S d| d| dS |dk(  r=| j                  rd| d| dt        | j                         dS |r| d	| S d
| d| dS |dk(  r;| j                  rd| d| dt        | j                         dS |rdnd}	| d|	 d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r|rN| j                  r,d| d| dt        | j                         d| j                   d	S d| d| d| j                   dS | j                  rd| d| dt        | j                         dS d| d| dS |dk(  rgt        |t              r|\  }
}}nt        ||      \  }
}}| j                  r%d| d|
 d| d| dt        | j                         dS d| d|
 d| d| d	S |dv r|J t        |   }|t         j                  k(  rt        t         j                     }| j                  |      }| j                  t         j                        }d}d}|%|J dt        |      j                          }d| }| j                  r.| d| d| d| | d | d| | dt        | j                         dS | d| d| d| | d | d| | dS |d!k(  r]t        |t              rF|j                  t         j                  k(  sJ t!        t"        j$                  j&                  |f      \  }| d| S t(        )"Nr_   zmax_masked_reduce(r   r   rY  r#  r^   zmin_masked_reduce(rT  r  rc   zsum_masked_reduce(r   rZ   r   rd   zprod_masked_reduce(r   re   zxor_sum_masked_reduce(r   rg   r   r  rh   r   z}, r   ry   rR   z_combine_vec<r  rf   )r   rk   rX  rE   r  r   r   r   rH   rn   rY  r  ro   r  rG   r   rN   r4   rZ  r  r  )r.  r   r   r   use_weight_recpsr   r  r   r   r   r   r   r   r   r  r  t_extra	arg_extras                     rW   r  z"CppVecKernel.reduction_combine_vec
  s    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__%,c#a}Aj\::v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//>>-cU"ZL;t~~C^B__bcgcxcxbyyz{{-cU"ZLDDYDYCZZ[\\>>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,#- b& $5^Z#P b&~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),FEJJ&%ekk2)))4E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$*n5!''5::555 4QXX5E5E
} UU#j\**%%rV   c           	         t        |t              sJ |j                  J |j                  s4t        |t              r|j                  rd| d}t        	|   ||||      S |}|}|r!| j                  |j                         d| d}|r!| j                  |j                         d| d}|r|rd| d| d| d| d	}| d| d| }n#|r| d| }| d| }n|sJ | d| }| d| }| j                  |j                         d| d}|r6|j                  s!| j                  |j                         d| d}d| d| d}| j                  rS| j                  |j                         d| j                  |j                         d	| d
t        | j                         d}d| d}| j                   d| d| dS )Nr  z).all_masked()r   r  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rG   r   r   r,  r  r;  r  rX  rE   r-  )
r.  r   r  r  r?  lower_scalarupper_scalarcond
cond_printr1  s
            rW   r  zCppVecKernel.indirect_asserti  s$   #~...yy$$$zz$/DKK4&/7*3udCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii894&BtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrV   c                 *   t        |t              sJ |j                  st        	|   |||      S t
        |   }| j                  |      }t
        |   }| j                  |      }d| d}|t        j                  k7  r2|t        j                  k(  r| j                  |       d| d| d| d}|S |t        j                  k(  r |t        j                  k7  r| d| d| d}|S ||k7  r+||cxk(  rdk(  rn nd	| d| d}|S d	| d| d| d| d| d}|S )
Nr  r   z::from<ra  r  z.to<r   r5   r  )
r   rG   r   r,  r  rH   rY  r   rk   r  )
r.  r:  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsr  r1  s
            rW   r  zCppVecKernel.get_to_dtype_expr  s\   #~...zz7,S%CC#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD  %**$%**)<U$|nAo->cBD  %/6Q6*<.3%qA  +<./9J!L>YZ[jZkkmnqmrrstrV   r  )NNF)F)*r  r  r  r  r@  r-  r   r  r   rJ  r   r   rm   rY  rn   r  ro   r;  r  rG   r(  r   ri  r=   r	   rk   rL  r  r  r  r  r  r  rK  r  r  r  r   r  r  r  r`  ra  s   @rW   r  r  u	  s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N /3## zz# {{	#
 N+#T ,0<@ Lc]L zzL {{	L
 (L eC$789L L 
.	!L\ UZZ 4 !*S.()* * zz	*
 {{* *X<B{z$PLN ~ $
N 
ELL 
^ 
 D 
& (,/3+0==]& %]& 'tn]& EKK(]&~#VJ rV   r  c                        e Zd ZdZeZ	 	 d fd	Zd Zd Z	 ddZ	de
dej                  f fdZd fd		Zd
 Z fdZdej                  dej                  fdZ xZS )rr  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                     t         |   ||||d   |       || _        || _        || _        |r|n|| _        |r|n|| _        d| _        y )Nr5   T)r,  r-  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r.  r  r  r  r  r  r  r1  s          rW   r-  zCppTile2DKernel.__init__  s`     	1	
 -..2A}2A}#' rV   c                 L    t        | j                  | j                      d      S )NrD  )r-   rH  	outer_idxr3  s    rW   inner_itervarzCppTile2DKernel.inner_itervar  s"    !T]]4>>%B$C6"JKKrV   c                 b   | j                   | j                     }| j                   | j                     }t        ||| j                        }t        ||| j                        }| j
                  d u xr@ |dk(  xr9 |j                  |      xr& |j                  |       xr |j                  |       S r   )rH  r  rI  r  r  r  r   )r.  r   	outer_var	inner_varouter_strideinner_strides         rW   need_vec_transposez"CppTile2DKernel.need_vec_transpose  s    MM$..1	MM$//2	*5)T=O=OP*5)T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rV   c                 n   t         j                  j                  |      }| j                  }| dt	        |       }d}	t	        t        || j                  | j                     | j                               }
t	        | j                         }|r|	|}	}||
}}
d}| j                  |z  r| j                  | j                  }}n| j                  | j                  }}|r|dk(  rdnd}t        |t        j                        r|j                  r&t        |t        j                        rA|j                  s5dt         |    d| d	| d
|
 d
|	 d
| d
t	        |       d
t	        |       d}n4dt         |    dt	        |       dt	        |       d| d	| d
|
 d
|	 d
| d}|r| j"                  j%                         }na| j"                  j'                  |      s)| j"                  j)                  | j*                  |d      }nd}| j"                  j-                  |      }|r>t         |   }d| d| d}| d| d| d| d| d
}| j*                  j/                  |       |j1                  dt3        |            }|r'| j4                  j/                  t7        ||             |S | j*                  j/                  |       |S )Nr}  __place_holder__Tr  truefalseztranspose_mxn<ra  r  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r[   r   )r4   r  r  r  rE   r  rH  rI  rP  r  r  r  r   r   r  r  rH   r  r	  containsr  r  getr   r   ro   r  r;   )r.  r   r   r   is_store
store_moder   factorr:  r<  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rW   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_store  s    !!$'##SU+,-  3E4==;Y[_[m[m nop/0CC#VFF##h.'')=)=qA $$$$ A !)jL.HVPW
q%**%akkq%**%akk !e!4 5Qzl C56("SEF82k!n5ERTUGWWY[  !e!4 5Q{1~6FaTUGWWXYcXd e56("SEF827  xx(H""=1xx((U(SHKxx||M2H$U+I 6fX[SVWG$IQyk8*AfXQvhbQKMM##K0%--.@#h-POO%%l4&GH  MM##M2rV   r   r   c                 V   | j                   j                  |      }| j                  |      }| j                         }| j	                  |      r| j                  |||d      }| dt        || j                  z         }t        j                  j                  |      }| j                  |d|      }| j                  j                  | j                  |      }	|	j                  d| ||fi        t!        |	t"              sJ d|	_        |	S | j'                  |      }
t(        | U  ||
      S )NF)r  r}  r   r  T)r  r  r8  r  r  r  rE   rP  r4   r  r  ri  r  r  r  r  r   rG   r   rs  r,  r  )r.  r   r   r   rY  r  rh  r   r   r  r   r1  s              rW   r  zCppTile2DKernel.load'  s   iiood#$$U+""$""5)::c55 ; H "
#k%$..2H&I%JKGGG%%d+E**7Au=DXX&&tzz48F!!&4u*=rBfn555 FMM//6I7<i00rV   c                    d|v sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                  |      }| j                         }| j                  |      }| j                  |      r| j                  |||d|      }| dt        || j                  z         }| j                  sFt        j                  j                  |      t         t"        j$                  t"        j&                  gz   v r| d| dt        | j                         d}	n| d| d}	| j(                  j+                  t-        ||	             y | j/                  |      }
t0        | e  ||
||       y )Nr  T)r  r  r}  r  r   r   )r   rG   r   r  r  r  r  r8  r  r  rE   rP  rX  r4   r  r  r   r   r4  r5  r  r   r;   rs  r,  r  )r.  r   r   r  r  r   rY  r  storebufr   r   r1  s              rW   r  zCppTile2DKernel.store=  sd   }}%07%70||NN5)Eiit$""$$$U+""5)::c54D ; H #3{54>>3I'J&KLH~~!2!24!8M

M = "  zK4O3PPRSz4KK!!,tT":;//6IGM$	5$7rV   c                    | j                         }| j                  r2|j                  d| d| dt        | j                         d| d	       y |j                  d| d| dt        | j
                         d| d	       y )Nr{  r|  r   r   r   )r  r  r   rE   r  r  )r.  r   rY  s      rW   r  z#CppTile2DKernel.codegen_inner_loopsZ  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrV   c                    t         |   ||      }| j                  d   | j                  k  r| j                  nt	        | j                        \  | _        | _        | j                  | j                  d   k(  r+| j                  | _        | j                  | _
        d| _        |S | j                  | _        | j                  | _
        d| _        |S )Nr5   r   FT)r,  r  r  r}  reversedr  rI  r  rX  r  rP  r  r  r  )r.  groupreduction_groupr  r1  s       rW   r  zCppTile2DKernel.set_rangese  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rV   r   c                 Z    | j                  || j                  | j                               S )Nrz  )r  r  r  r  s     rW   rs  z"CppTile2DKernel.transform_indexingw  s0    ++%%' , 
 	
rV   rI  r  )r  r  r  r  rp  r@  r-  r  r  r  ro   r   r  r  r  r  r  rs  r`  ra  s   @rW   rr  rr    ss    < #I (.L
 6::x1 1UZZ 1,8:	$


 
uzz 
rV   rr  _bodyc                 j   | j                   gt        | j                  j                               z   }d}d}|D ]  }|j                  j
                  D ]  }|j                  dk(  s|j                  dv r!|j                  dvrd}t        |d      r|j                  rt        j                  |j                  v sJ |j                  t        j                     }|j                  r|j                  t        vrd}|&||j                  k7  st        j                  d       |j                  }d}  ||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexr:  )r  r  r  r  r  Trk  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr#  	subblocksr  r  nodesoptargetr   rk  rA   rj  r   r   warningswarn)r  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr0  rg  s          rW   get_loop_body_lowp_fpr    s$    ""#d5??+A+A+C&DDJ+/MI !	__** 	!Exx=(ELL = -  || $  !	uf%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW$+MMM 	9	!!> )##rV   c                   J     e Zd ZdZ fdZdeee   ee   f   fdZd Z	 xZ
S )TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                 "    t         |           y r  )r,  r-  r.  r1  s    rW   r-  zTilingSelect.__init__  s    rV   r   c           	        # t        |      }t        |      }|sJ t        d |D              rg g fS t        j                  }t        |d         d   ##rt        #fd|dd  D              r#}t        j                         j                  |      }| j                  |||      }|rt        |d       \  }}	t        |      t        |	      z   }
t        j                  j                  rVd }d	 }d
 }t!        t#        |
            D cg c]  }t%        t&        j(                  |       }}t#        |      }|d | ||d  }}i }i }|D ]n  }|j*                  gt-        |j.                  j1                               z   }|D ]4  }|j2                  j4                  D ]  }|j6                  dv r|j6                  dk(  rdnd}|j8                  j;                  ||f      |j<                  |   j<                  d      } |||      r4 |||||      }|j6                  dk(  r|n|dvr ||j6                  |       t?        |j6                  t@              s|j6                  jC                  d      r|j6                  dv r|j6                  |vrd||j6                  <   ||j6                  xx   dz  cc<    7 q tE        |j1                               }tE        |j1                               }d}d}||k\  s|dkD  r||z  |k\  rg g fS |	s9|r7t#        |      dk(  r)tG        ||d      g      s||d      |dz  k  r	|dk  rg g fS |tH        v rt        j                         j                  |      } |D ]  }!|!dk  r|!t#        |
      z   }!|!dk  s|!t#        |
      k\  r*tG        |
      retJ        j2                  jL                  jO                  |
|!   d      }"|"| k  sitJ        j2                  jL                  jQ                  |"|        | dz  } n|
|!   | k  s| dz  } n t#        |      dk(  r|g|fS t#        |      dk(  r||g|fS g g fS c c}w )Nc              3   ,   K   | ]  }|t         v  y wr  )rv   r  r   s     rW   r  z-TilingSelect.select_tiling.<locals>.<genexpr>  s     HEu//H   r   c              3   @   K   | ]  }t        |      d    k(    yw)r   N)r  )r  	loop_body_lowp_fp_dtypes     rW   r  z-TilingSelect.select_tiling.<locals>.<genexpr>  s(      "
 #9-a0NB"
   r5   r  c                     t        | d         S r   r   sizess    rW   r  z,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh- rV   rj  c                 L    ||d      }t        | ||      }|j                  r|S d S rU  )r  r  )r   rH  r  r  r  rN  s         rW   _try_get_stridez3TilingSelect.select_tiling.<locals>._try_get_stride  s4     '~a'89G0OF%+%5%56?4?rV   c                 2    | |vrd|| <   y || xx   dz  cc<   y r   rU   )	node_namenon_contig_indexing_op_counters     rW   _update_negative_op_countz=TilingSelect.select_tiling.<locals>._update_negative_op_count  s(     !(FFDE6yA6yAQFArV   c                     t        |      dk(  xr: t        |       dkD  xr* |d   dk\  r|d   n|d   t        |       z   t        |       k  S Nr5   r   r  )rH  r  s     rW   _is_valid_indicesz5TilingSelect.select_tiling.<locals>._is_valid_indices  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rV   )r:  r  r  r:  r   r   r5   masked_subblock)r2   r  r4  r  gQ?#   ro  
   r  ))rC   rB   rf   r   rn   r  r"  r   rN  	nelements_select_tiling_indicesr_   r   r   r  enable_tiling_heuristicsrG  r   r.   r   r  r  r#  r  r  r  r  r  r@  indexing_from_argsr  r   ro   
startswithrc   r(   r   r4   r  r  guard_lt)$r.  fn_listvar_sizes_listloop_bodies
all_dtypesr   r  r  r  r  r|  r!  r%  r(  r  rH  r}  r  reduction_vars
op_counterr$  r  r
  r  r0  arg_idxr   rN  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_ranger  s$                                      @rW   select_tilingzTilingSelect.select_tiling  s    %W-/<
zHZHHr6M.{1~>qAc "
(_"
 
 #E#002<<5<I44^]
 %($?&"E?  ,)??Kzz22@G" #3{#34 34;;B  #&e*-o._-. % .0
 BD.( BE"'"2"2!3d5??;Q;Q;S6T!TJ%/ B	%.__%:%: BE$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X~#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,< % 7 78I J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7BBB@ Z..01-0299;.* #'%'"-1CCQJ2V;N
 r6M (N+q0,!."34
 nQ/0=13DD r6M% *668BBBO%3 M$q((5K8H(H$q(MS=M,M '4%&WW%5%5%?%?'6 &@ &
 &3GG,,55j+N,71,<M!$]3kA(3q(8" >"a'%66>"a'%}5~EE2vQs   
!Qc           	         g }t        ||      D ]`  \  }}t        j                  |g| }|t        j                  |j
                  |j                        D cg c]  }|j                   c}z  }b t        t                  }	g }
t        t                  }t        t                  }|D ]  }|j                  D ]  }t        j                  d|j                        s$t        |||      }|dk(  r7|dk(  rO|	j                  t        |j                  dd               |
j!                  t        |j                  dd               t#        d |j                  D              r(|j                  t        |j                  dd               |j                  t        |j                  dd                
 |	|z
  |z
  }t%        |d       \  }}t'        |      t'        |      z   }t'        |	      dk(  r|dz
  gS |rt)        |      dd  S |	|z  |z
  }t)        |	      }t'        |      dk(  r|d   |v r|d   |dz
  k(  r|S t)        ||
j*                        dd  S c c}w )	Nz^d\d+$r   r5   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   SIZEr  r  s     rW   r  z6TilingSelect._select_tiling_indices.<locals>.<genexpr>w  s     S!4995S   $&c                     t        | d         S r   r  r  s    rW   r  z5TilingSelect._select_tiling_indices.<locals>.<lambda>|      s5QR8} rV   r  rH  r   )r  r
   extract_read_writes	itertoolschainreadswritesr   r   rm   r  r   searchr   r  r  r  r"  r_   r   sortedcount)r.  r3  r4  r  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   rN  contig_onlyr  r  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rW   r.  z#TilingSelect._select_tiling_indices_  sV    	 .9 	UMB	11"AyAByrxx/ST#))TTI	U !o'",S/"3",S/"3 	CE)) CyyCHH5,UCGQ;q[OOC$56$++C,=>Sv?R?RSS+//CHHQRL0AB+//CHHQRL0ABC	C "$;;>UU!$^9T!U5zC$88{q  1$%%+&rs++11##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   I)r  r  r  r  r-  r   r#  rm   rA  r.  r`  ra  s   @rW   r  r    s7    
i 
tCy$s)#	$	iV.KrV   r  c                        e Zd Z fdZd ZdefdZdefdZd Z	d Z
d	 Zd
ee   fdZd Zd Zddee   fdZdeded   fdZ xZS )r   c                     t         |   |j                  |j                  j                         || _        d | _        d | _        t        j                         | _
        g | _        y r  )r,  r-  r  wsr  rV  rH  r|  r   rN  picked_vec_isakernelsr.  rV  r1  s     rW   r-  zCppKernelProxy.__init__  sQ    **LOO,G,GH(2=2J2J2L(*rV   c                 `    |D ])  }t        |t              sJ t        j                  |       + y r  )r   r$   r:   propagate_scheduler_node)r.  r  r0  s      rW   data_type_propagationz$CppKernelProxy.data_type_propagation  s1     	@Ee]33388?	@rV   scheduler_nodec                     t        |j                  t              syt        j                  |       t        |j                        d   d uxr t        |j                        d    S )NTr   r5   )r   r  r   r:   re  r  )r.  rg  s     rW   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler  s\    ...944^D!."6"67:$F C).*>*>?BB	
rV   r  c                     dt         j                  j                  fd}|j                  gt	        |j
                  j                               z   }|D ]  } ||j                          y )N	sub_graphc                 l   dt         j                  j                  dt        t         j                     fddt         j                  j                  dt        t         j                     fddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffd}t        | j                        }g |D ]9  }|j                  d	v r |      xt        v rt        fd
|j                  D              rB|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t         xj"                  dz  c_        d d d        |j                  dk(  r |      xt        v r|j                  \  }}}}} ||      rt$        j&                  j)                  |      | j+                  |      5  | j                  d||f      |j-                  |       t         xj"                  dz  c_        d d d        u|j                  dk(  r|j                  \  }}}	}
|t        v st         j                  t         j.                  t         j0                  t         j2                  fv sJ |t        v rt         j                  nt         j                  |	|
f|_        |j                  dk(  r`|j                  d   t        v rK|j                  \  }}
t        fd|j                  D              rk||
t         j                  f|_        |j                  dk(  rq|j                  d   t        v r\|j                  \  }}t        fd|j                  D              rډj5                  |       ||t         j                  f|_        |j                  dk(  r%|j                  \  }}}|t        v rd |||      s[| j+                  |      5  | j                  d|||f      |j-                  |       t         xj"                  dz  c_        d d d        t        v st        fd|j                  D              r|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t         xj"                  dz  c_        d d d        :< dt         j                  j6                  ffd} ||        y # 1 sw Y   pxY w# 1 sw Y   }xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr  r   c                 Z   | j                   dk(  r,t        j                  j                  | j                  d         S | j                   dk(  r| j                  d   S | j                   dk(  rCt        | j                        dkD  r| j                  d   S | j                  j                  dd      S y)	z6Get input dtype for nodes that may consumes lowp fp dtr  r5   r  rH  r  r   r   N)r  r4   r  r  r  r   r  r  r  s    rW   get_input_dtypez]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype  s    ;;')77,,TYYq\::[[$6699R=([[J.499~)#yy|+#{{{DAArV   c                 $   | j                   dk(  rFt        | j                        dk(  sJ t        j                  j                  | j                  d         S | j                   dv r| j                  d   S | j                   dk(  r| j                  d   S y)	z6Get output dtype for nodes that may produce lowp fp dtr  r   r5   )r  r4  r:  rH  r  r   N)r  r   r  r4   r  r  rn  s    rW   get_output_dtypez^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype  sz    ;;&(tyy>Q...77,,TYYq\::[[$JJ99R=([[$6699Q<'rV   dtc                 .    |t         v sJ  |       |k(  S )z]Check if the given node produces output with expected low precision floating point data type.)r   )r  rr  rq  s     rW   is_lowp_fp_sourcez_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source  s!    ]***'-33rV   c                 X    |t         v sJ  |       x}r||k(  S | j                  dk(  ryy)zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   r  )r  rr  input_dtypero  s      rW   is_lowp_fp_sinkz]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sink  s>    ]***"1$"77;7&",,[[J. rV   c                 Z     |       xr t        fd| j                  D              S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c              3   0   K   | ]  } |        y wr  rU   r  userrr  rw  s     rW   r  z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>  s      ;26OD"-;   r"  users)r  rr  rw  rt  s    `rW   is_lowp_fp_source_no_promotezjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote  s1     )r2 s ;:>**; 8 rV   )r  r:  c              3   0   K   | ]  } |        y wr  rU   rz  s     rW   r  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>       M?44Mr|  r   r  r  c                     | uS r  rU   r  to_type_nodes    rW   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>  s    A\4I rV   r5   r  r  r4  rH  c              3   0   K   | ]  } |        y wr  rU   rz  s     rW   r  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  r  r|  c              3   0   K   | ]  } |        y wr  rU   rz  s     rW   r  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>"  r  r|  r  c              3   0   K   | ]  } |        y wr  rU   )r  r{  r   rw  s     rW   r  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>J  s     Ue <Ur|  c                     | uS r  rU   r  s    rW   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>R  s    A\<Q rV   rk  c                 V    dt         j                  j                  ffd} ||        y )Nrk  c                 B   dt         j                  j                  fd}| j                  D cg c]  }|j                  dk(  s| }}|D cg c]  } ||      s||j
                  i }}|D ]  }|j                         D ]q  \  }| j                  v st        fd|D              sv s.t        d |D              sAj                  d   }j                  |       | j                         s  | j                  | j                          y y c c}w c c}w )Nto_nodec                 :    t        d | j                  D              S )Nc              3   :   K   | ]  }|j                   d k(    yw)r  N)r  r  usrs     rW   r  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>a  s     "U3::#;"Us   r}  )r  s    rW   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to`  s    ""Uw}}"UUUrV   r  c              3   \   K   | ]#  }|j                   d    j                   d    k(   % ywrH  Nr  )r  r  r  s     rW   r  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>l  s&     #ScCHHRLDIIbM$A#Ss   ),c              3   F   K   | ]  }|j                   d    t        v   ywr  )r  r   r  s     rW   r  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>o  s"      ,&JM(E,&r  rH  )r   fxNoder  r  r~  r  r"  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	rk  r  r  all_to_nodesall_to_nodes_and_users
node_usersr~  val_nodeto_lowp_fp_legalized_nodess	     `     rW   _eliminate_duplicate_to_nodezCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_nodeY  s-   VUXX]] V *3$!%DKK:<U$L $ 8D./3{SWGXtzz*.* . '= ;
+5+;+;+= ;KD%#y6 ##SU#S S$(,F$F(+ ,&QV,& )&
 ,0+?+?+C $ : :8 D ) 4 4T :;;, !..6!( 79$.s   DDDD)r   r  Graph)rk  r  r  s     rW   eliminate_to_dtypez`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtypeX  s"    ')EHHNN ')R -Y7rV   )r   r  r  r   r   r#  r  r  r   r"  r~  r  inserting_aftercall_methodrn   r  r   cpp_to_dtype_countr4   r  r  inserting_beforereplace_input_withr2  r3  r  r  r  )rk  r  sub_graph_nodesr0  r2   r   r   	value_varr   r   r  r  r  rr  r   ro  rq  rw  rt  r  r  s                @@@@@@@@rW   add_to_dtypezDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s    ehhmm  8M  
 uxx}} 
 %++9N 
 4 45;; 4
	!ehhmm 	! 	!588== ekk  #9??3O)+&( wLL$::/66=H MMM **Q-C"2259 8'0'<'<&c5%++-F (= ( 33(*I  22a728 8 LLG+.u55-G16.Cq)Q3IrB GG--d3E"33E: 8'0'<'<&c9e-D (= ( 00LI22a728 8 \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 \\Z/EJJrNm4S',zz$S%MMM "%uekk!:EJ\\Z/EJJrNm4S#(::LS!RMMM  /55e<"%q%++!6EJ\\%779>6S)UI !M1  <IyQ!*!;!;E!B @/8/D/D$.c9i5P 0E 0" !& 8 8L Q ' : :a ? :@ -  UUU"'**Q-C!*!:!:5!A @/8/D/D$.c5%++5N 0E 0" !& ; ;$02Q!" !( : :a ? :@ @ owr*8ehhnn *8X y)y8 8 8 8J@ @ @ @s4   ?AVAV%AV:AV)V	V	V&	)V3	)r   r  r  r  r#  r  r  r  )r.  r  r  r
  r  s        rW   legalize_lowp_fp_dtype_loopbodyz.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  s]    Z	*EHHNN Z	*x  **+d93F3F3M3M3O.PP
# 	*I)	*rV   c                     t         fd|D              r|D ]  }|j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]n  }|j                  dv s|j                  sJ t        j                  |j                  v sJ |j                  t        j                     }|j                  t        v rnJ    y |D ]^  }t        |t              sJ t        |j                  t               sJ |j                  }|j#                         rN j%                  |       ` y )Nc              3   d   K   | ]'  }t        |t              xr j                  |       ) y wr  )r   r$   ri  )r  r0  r.  s     rW   r  z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  s3      
 um,Q1J1J51QQ
   -0)r  r  )r"  r  r  r#  r  r  r  r  r  rk  rA   rj  r   r   r   r$   r   is_memory_copyr  )r.  r  r0  r
  r  fx_noderg  r@  s   `       rW   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype  sD    

 

  B#kk445KK))0029 
 ", BI#,??#8#8 B">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#ABB	B  	;Ee]333ekk8444"[[D&&(44T:	;rV   c           	      ^   !"# t              t              k(  sJ | j                  !t        d       \   "| j                   "       !#fd} "fd# |t              }t
        j                  xj                  |j                  z  c_        t
        j                  xj                  |j                  z  c_        t        j                  |      | _        | j                  r| j                  s6|g| _        | j                  dd        | j                  j!                  |        y t"        j$                  j&                  j)                  d      5  t+               }|j-                        \  }}t        |      t        |      k(  sJ d}t/        t1                    }	t3        d |	D              rd}d}
d }|r{d}|d	   }|d
z   }t        | j                  j4                        |kD  rM| j                  j4                  |   j6                  }| j                  j4                  |   j6                  }|xr | }
t        |      d
k(  rt8        xj:                  d
z  c_        | j                  j=                  |d	   |d	         } |t>        |d	   |d	         }|j@                  |jB                  z
  }|jD                  d	|jB                  fi|_#        t&        jH                  jJ                  r|r |t>        |d	   |d	   |      }n|}|jD                  g|_&        |jD                  |jB                  |j@                  fi|_#        ||g| _        |}nt        |      dk(  r|d
   t        | j                        d
z
  k(  r|d	   |d
   k(  sJ t8        xj:                  dz  c_        | j                  j=                  |d	   |d	         }d	|jB                  f|jB                  |j@                  fd}|j@                  |jB                  z
  }| j                  j=                  |d
   |d	         }d	|jB                  f|jB                  |j@                  fd}|j@                  |jB                  z
  } |tN        |d	   |      }|jD                  |d   |jD                  |d   i|_#        g }t&        jH                  jJ                  rh|rfdD ]`  \  }}|dk(  r|nd }|dk(  r|nd } |tN        |d	   |||      }|jD                  ||   |jD                  ||   i|_#        |jQ                  |       b n |t>        |d	   |d	         }|jD                  |d   |jD                  |d   i|_#        |jD                  g|_&        |jQ                  |       |jD                  |d   |jD                  d	|j@                  fi|_#        |jD                  |jD                  g|_&        |jQ                  |       |g|z   | _        |}n|g| _        | j                  |
|       | j                  j!                  |        d d d        y # 1 sw Y   y xY w)Nc                     t        | d         S r   r  r  s    rW   r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>  rH  rV   r  c                      j                   | g| 5 }t        xj                  dz  c_         |       |cd d d        S # 1 sw Y   y xY wr   )
new_kernelr   generated_kernel_count)r%  r  rZ  rV  runs      rW   codegen_kernelz8CppKernelProxy.codegen_functions.<locals>.codegen_kernel  sL    (((4t4  ..!3.F  s   #AAc           	      T   | j                        \  }}d}t        	      D ]u  \  }}|ft        t        j                              dffv r|rJ  |||       ;d}|dfk(  sJ d| d d        | j                         5   ||d       d d d        w y # 1 sw Y   xY w)NFrU   Tzunexpected group: rQ  r   )r  r  r   rJ  rK  r6  )
rZ  r  r7  	in_suffixrR  rS  r3  r  r  r4  s
         rW   r  z-CppKernelProxy.codegen_functions.<locals>.run  s    #)#4#4UO#L D.I!$Wn!= %IO,9??5/BCRH!   )(=t^, $I$)  V ,I;d5'OCTUV 
  //1 %4% %%% %s   	
BB'	Finplace_buffersTc              3   ,   K   | ]  }|t         v  y wr  )rw   r  s     rW   r  z3CppKernelProxy.codegen_functions.<locals>.<genexpr>  s     Su5 ::Sr  r   r5   )r  r   maintailr  )r  )r  r  )r  r  r  ))r   rV  r_   r  ru  r4   r  removed_buffersinplaced_to_removerF  r'  rH  ra  rH  rb  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchr  rA  rB   rC   rf   rB  r  r   generated_cpp_vec_kernel_counttiler  r:  
tiled_sizer   rz  r  enable_loop_tail_vecr{  rr  r  )$r.  r3  r4  r  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecr6  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionr  
vec_kernelrX  tail_kernel
outer_looprR  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerZ  r  rV  r  r  s$    ``                             @@@@rW   codegen_functionsz CppKernelProxy.codegen_functions  s   7|s>2222((!$^9T!U/		%( 'y1	=#@#@@	""m&F&FF"!6""$--)?DL,,UD9NN%%d+ __##))%)@ C	,(NM-:-H-H.*NN ~&#n*====#3N74KLJS
SS#( .3+K',$#1!#4 #3a#7 t~~++,/??+/>>+?+?(,"l ) ,0>>+?+?(,"l ) -I5I1I 4 >"a'66!;6~~**>!+<^TUEV*W+ ."3^A5F
 !II7	,0HHq$//6J+K
(::227G"0$&q)&q)!	#K #0K48HH:M0-1XX7S,T) *K8"^$)"1%T]]);a)??&q)^A->>? 66!;6!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I .#"1%"! NNL$8NNL$8/+ !::227G- 3( 07&/@Od ) 07&/@Od ) "0+*1-*,," 'NNL,A&NNL,A0, $**62-30 "0$nQ&79J"J #V(<"V(<0J, 2<0@J-&&z2"V(<"JOO(<3M/ 5?NNJNN3SM0&&}5 -<( -,,/ NN%%d+GC	, C	, C	,s   SX##X,c                     |D ](  }| j                  |       t        j                  |       * | j                  ||       y r  )r  r:   propagate_loopbodyr  )r.  r5  r4  r@  s       rW   codegen_loop_bodiesz"CppKernelProxy.codegen_loop_bodies`  s?     	9D0062248	9 	{N;rV   r  c                    | j                  |       | j                  |       t        |      dk\  sJ d }|D cg c]  }t        j                  ||       }}t        t        j                  t              r2t        j                  j                  rd }|D cg c]
  } ||       }}|D cg c]  }|j                  d    }}| j                  ||       y c c}w c c}w c c}w )Nr5   c                     | j                          | j                          t        t        j                  t
              r | j                  | S | j                  |      S r  )decide_inplace_updatemark_runr   r4   rZ  r1   r  codegen)r  
index_varss     rW   rR  z(CppKernelProxy.codegen_nodes.<locals>.fnl  sF    &&(MMO!(($56!tzz:..||J//rV   c                 R    t         j                  j                  |       }| |_        |S r  )r4   r  localize_functionoriginal_fn)rR  
wrapped_fns     rW   wrap_fnz-CppKernelProxy.codegen_nodes.<locals>.wrap_fn{  s+    33EE
 *,
&!!rV   )r  rf  r   rI  partialr   r4   r  rJ   r  r  r  )r.  r  rR  r  r3  r  r4  s          rW   codegen_nodeszCppKernelProxy.codegen_nodesf  s    ##E*""5)5zQ	0 <AA49$$R.AA q--/AB&&44" .55rwr{5G549:D$**Q-::w7# B 6:s   CC-C"c                 >    | j                  | j                  ||       y r  )r%  rH  )r.  r   r  s      rW   r(  zCppKernelProxy.codegen_loops  s    kBrV   c                 F    | j                   D ]  }|j                           y r  )rb  r  r.  rZ  s     rW   r  z4CppKernelProxy.update_stores_with_parallel_reduction  s!    ll 	;F88:	;rV   r   c                 (   |J d}| j                   D ]q  }t        j                         5 }|j                  ||      r@d}|j	                  |j                                |j                  |j                                d d d        s y # 1 sw Y   ~xY w)N
C10_LIKELYC10_UNLIKELY)rb  r   r   rG  r   r   r  r  )r.  r   	if_prefixrZ  r   s        rW   r  zCppKernelProxy.gen_body  s     	ll 	3F%%' 35,,T9= .I''6KK 12	3 3	33 3s   ABB	inner_loop_reduction_outer_notr  	LoopLevelc                 t    d fd} j                   d   }|r|sJ  ||       nZ|j                           j                  j                  |j                          j                  j                  |j                          j
                  j                  |j
                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                         y )Nc           
         t        j                        dk\  sJ j                  d   }j                  d   }t        |t              sJ t	        |      t
        k(  r^|j                  |j                         |j                          j                  j                  |j                  |j                  z          n5|j                          j                  j                  |j                         t               }t        j                         5 }|j                  |d| j                        r:|j                  |j!                                |j                  |j"                         d d d        t        j                         5 }|j                  |d| j                        r	|j                  |j!                                t	        |      t
        k(  r|j$                  }|D ]X  }| d| j                   dt'        | j(                         d}t+        |j,                  ||       t+        |j"                  ||       Z |j                  t/        |j"                  | j                  | j                   d	| j(                  | j0                               n|j                  |j"                         d d d        |_        y # 1 sw Y   VxY w# 1 sw Y   |_        y xY w)
Nr   r   rH  r  r  r   z_tail - r  r  )r   rb  r   r  r  ru  r  r  r~  r  r7   r   r   rG  r   r   r   r  r  rE   r  r   r  r   r:  )	r  main_loop_kerneltail_loop_kernel
suffix_bufr   r7  r   r   r.  s	           rW   !aggregate_reduction_prefix_suffixzUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix  s   t||$)))#||A#||B/.=== $%2 !::$22 !::<%%,,$55&778
 !::<%%,,-=-N-NO &J%%' I5#66jnn ''
(9(9(;<%%&6&G&GHI %%' M5#66
 ''
(9(9(;<,-:)9)M)M$2 D*.uZ^^4DH[YcYnYnMoLppq'rH,-=-D-DdHU, 0 A A4 #))6 0 A A *#->>"2% 8 * 5 5 * #))*:*K*KL/M0 %/D!=I IM0 %/D!s   AJ;D(K;KKr   )r  r  )
rb  r  r~  r  r  r  r  r  r  r  )r.  r  r  r  main_kernels   `    rW   r  z*CppKernelProxy.aggregate_reduction_buffers  s    6	/p ll1o):-j9113!!(()E)EF!!(()E)EF&&--k.S.ST&&--k.S.ST!!(()I)IJ##**;+M+MN**1155	
rV   r  )r  r  r  r-  rf  r$   ri  r   r  r  r  r  r#  r  r(  r  r   r7   r  rk   r  r`  ra  s   @rW   r   r     s    +@
= 
_* _*B;<v,p<84#6 8BC;3Xl3 3L
.2L
@H@UL
rV   r   c                   $     e Zd Z fdZd Z xZS )rW  c                 p    t         |   |j                  |j                  j                         g | _        y r  )r,  r-  r  r`  r  rY  rc  s     rW   r-  zOuterLoopFusedKernel.__init__  s)    **LOO,G,GH%'
rV   c           
         g }| j                   D cg c]  }|j                          }}|D ]h  }|j                  }|J |j                  |j	                  t        t        |      |j                  z
  |j                        |      j                         j t        t        |j                  t        |            |j                        S c c}w )Nr/  )rY  r  r|  r  r  r	  r   r  r
  r^   r_   )r.  r  r  kernels_parallel_depthrH  nested_kernelsrZ  r|  s           rW   r  z*OuterLoopFusedKernel.decide_parallel_depth  s    !#48JJ+
'0I  "+
 +
 % 	F !,,K***")),,!,/A/M/MM$6$B$B	  !.
	  "1137M3N +66	
 	
)+
s   C)r  r  r  r-  r  r`  ra  s   @rW   rW  rW    s    (
rV   rW  c                       e Zd ZdZdZdZy)ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrU   rV   rW   r  r  
  s    )1%E"rV   r  c                       e Zd ZdZ eej                  ej                  g      Ze	de
j                  dee   fd       Z fdZdefdZd Zd	 Zd
 Zdee   fdZd Zd Zd ZdededefdZd Zd Zd Zd Zdee    fdZ!de"fdZ#de$e"e%e f   fdZ&dedefdZ'dede(e   de(e   fdZ)d  Z*d! Z+d" Z,d%d#Z-d$ Z. xZ/S )&CppSchedulingi  devicer   c                     | j                   S r  )backend_features)r%  r  s     rW   get_backend_featuresz"CppScheduling.get_backend_features  s    ###rV   c                 V    t         |   |       |r| j                          d| _        y NF)r,  r-  reset_kernel_group_ready_to_flush)r.  r!  r1  s     rW   r-  zCppScheduling.__init__   s'    ###%$rV   statusc                     || _         y r  r  )r.  r  s     rW   _set_flush_statuszCppScheduling._set_flush_status&  s
    %rV   c                 &    t        d |D              S )Nc              3      K   | ];  }t        t        t        j                  j                  j
                  |             = y wr  )r   r  r4   r  r  r   rE  s     rW   r  z)CppScheduling.group_fn.<locals>.<genexpr>*  s,     M!U3qww//88!<=Ms   AA)r   )r.  r  s     rW   group_fnzCppScheduling.group_fn)  s    MuMMMrV   c                 "    t               | _        y r  )KernelGrouprV  r3  s    rW   r  z CppScheduling.reset_kernel_group,  s    'MrV   c                    |j                         s|j                         rt        j                  ||      S |j                         r(|j                         rJ t	        j                  ||      S | j                  ||      t        j                  k(  rt        |t        t        f      sJ t        |t        t        f      sJ |j                  \  }\  }}|j                  \  }\  }}|dk(  r|dk(  s	J ||f       fdt        |      t        |      k  r|n|}t        |t              sJ t        |      t        |      k  r|n|}	 |	      }
|j                  |
       |j                  \  }\  }}|j                  \  }\  }}||k(  rt	        j                  ||      S  |      }t        |	t              r|	j                  |       ngt        |	t              sJ |	j                  D ]&  }t        |t              sJ |j                  |       ( t	        |	j                  |	j                        }	|j                  \  }\  }}|j                  \  }\  }}||k(  s	J ||f       t	        j                  ||      S | j                  ||      r't         j                  ||| j#                  ||            S t	        j                  ||      S )NrU   c                 D   t        | t              rt        | j                        dkD  sJ | j                         d }t	        t
                  }| j                  D ];  } 	|      \  }}||}||k(  sJ ||| j                  f       |j                  |       = |t        |      fS t        | t              sJ | j                  }t        |t        j                        sJ |j                         \  }}}|j                  t        |j                  j                               fS rU  )r   r"   r   snodesr   r   updater#  r$   r  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsr  )
r  r  r'  snodevexprscomp_bufferr   r@  get_indexing_ranges_exprss
            rW   r,  z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprsA  s   !$(:;"4;;/!3@T[[@3%)
)3C):%)[[ 9E'@'GHAu)1-.
#-?PZDKK4PP?*11%89  *4+???)$>>>&*ii)+r7H7HIII%0%G%G%I
4#T5H5H5O5O5Q0RRRrV   )extra_indexing_constraints)
is_foreachr!   r'  is_templater"   _why_fuse_nodesr  r  r   r$   r  r   recompute_size_and_bodyr#  r!  can_fuse_vertical_outer_loopr  _get_outer_loop_fusion_depth)r.  r  r  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsr(  r,  s                @rW   r'  zCppScheduling.fuse/  s   !1!1!3-225%@@ ((***%**5%88 $$UE2#BBC "%-9K)LMMM!%-9K)LMMM&+kk##E7&+kk##E7"}BJ'8JJ6S& +.e*s5z*Au!.-@@@$'JU$;5+DX+N(66/G 7  !&:E1 %:E1E>-225%@@ 7P"73 h6443V 5  &h0BCCC!) )%???557Z 6 
  2(2D2DhooVH %:E1 %:E1~5u~5~)..ue<<225%@2775$"C"CE5"Q  *..ue<<rV   c                     |j                   \  }\  }}|j                   \  }\  }}||k(  r||k(  rt        j                  S |dk(  r|||z   k(  rt        j                  S | j	                  ||      rt        j
                  S y )NrU   )r  r  r  r  &_can_fuse_nodes_with_compatible_rangesr  )r.  r  r  r   r4  r5  r6  r7  s           rW   r0  zCppScheduling._why_fuse_nodes  s    #kkE7#kkE7E>g0#444b=Uego5#88866ueD#BBBrV   c                 <   |j                   \  }\  }}|j                   \  }\  }}|dk(  xr |dk(  }t        j                  |      t        j                  |      k(  }	t        |      dk(  xs t        |      dk(  }
|r|	r|
syt        |      t        |      k  r|n|}t        |      t        |      k  r|n|}t	        |t
              ryt	        |t              sJ t	        |j                  t        j                        ryt	        |j                  t        j                        sJ |j                  j                  j                         }d }t	        |t
              rt        t        t        df             }|j                   D ]  }t	        |j                  t        j                        r ndt	        |j                  t        j                        sJ |j#                  t        |j                  j                  j                                       t        |      dk7  ryt%        t'        t)        |                  }n\t	        |t              sJ t	        |j                  t        j                        sJ |j                  j                  j                         }||k7  ryy)NrU   r5   F.T)r  rY  rd   r   r   r"   r$   r  r   TemplateBufferr%  dataget_sizer   r   r   r#  r  r#  nextiter)r.  r  r  r   r4  r5  r6  r7  c1c2c3r8  r9  ranges2ranges1
ranges_setr(  s                    rW   r=  z4CppScheduling._can_fuse_nodes_with_compatible_ranges  s    $kkE7#kkE7],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&89 .-888n))2+<+<=.--r/@/@AAA !%%**335h 23#E#s(O46J! Bejj"*;*;<!%**b.?.?@@@uUZZ__%=%=%?@A	B :!#4Z 012Gh666hmmR->->???mm((113GgrV   c                     t        |t        t        f      sJ t        |t        t        f      sJ t        d ||fD              ry| j	                  ||      d uS )Nc              3   <   K   | ]  }t        |t                y wr  )r   r  r  s     rW   r  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>  s      
>BJt89
s   F)r   r"   r$   rf   r0  r.  r  r  s      rW   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_impl  sd    %"4m!DEEE%"4m!DEEE 
GLen
 
 ##E51==rV   c                    |j                         s|j                         ryt        |j                               t        |j                               z   t        j                  j
                  kD  ry| j                  ||      S r  )r/  r   r+  r   r  max_horizontal_fusion_sizerM  rL  s      rW   can_fuse_horizontalz!CppScheduling.can_fuse_horizontal  sf    %"3"3"5!"S):%;;jj334 --eU;;rV   r  r  c                 p   |j                         x}rt        |j                  t        j                        xr~ t        |j
                  t        j                        xrX t        |j
                  j                        dk(  xr4 |j
                  j                  d   j                         |j                  k(  S y)Nr5   r   F)get_template_noder   layoutr   MultiOutputLayoutr  MultiOutputr   inputsr  r   )r.  r  r  template_bufs       rW   can_fuse_multi_outputs_templatez-CppScheduling.can_fuse_multi_outputs_template  s     !2244<4<..0D0DE Iuzz2>>:I

))*a/I JJ%%a(113|7H7HH	 rV   c                    d}t        d ||fD              s|S t        |t              r|j                         d   n|}t        |t        t
        f      sJ t        |t              r|j                         d   n|}t        |t        t
        f      sJ |j                  \  }\  }}|j                  \  }\  }	}
|dk(  r|	dk(  r|dk7  r|
dk7  r|S t        d ||fD              r'|j                  |j                  k(  r|j                  S |S t        t        |      t        |	            }|dk\  rI|d | |	d | k(  r>t        d ||fD              r(t        |      t        u r|n|}|j                  |k(  r|S |S |S |S )Nr   c              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )r  r  r"   r$   r  s     rW   r  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s/      
  J+-?OP
r  rH  rU   c              3   >   K   | ]  }t        |      t        u   y wr  r  r  s     rW   r  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  r  r   r5   c              3   >   K   | ]  }t        |      t        u   y wr  r  r  s     rW   r  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s      >BT
99r   )r"  r   r  r$  r"   r$   r  r&  r^   r   rf   r  )r.  r  r  DISABLE_OUTER_LOOP_FUSION_node1_node2r   r4  r5  r6  r7  r&  _compare_nodes                rW   r3  z*CppScheduling._get_outer_loop_fusion_depth  s   $%! 
 
 

 -, %!<= !!#B' 	
 &#5}"EFFF %!<= !!#A& 	
 &#5}"EFFF$llE7$llE7B;5B;7b=W],,TeU^TT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLen  "%[,GGEU  !88<SS2244 /.((rV   c                    |j                          xro |j                          xr\ |j                         |j                  z  xr= | j                  ||      xr |j	                           xr | j                  ||      dk\  S r   )r/  get_operation_names	ancestorsrM  r  r3  rL  s      rW   r2  z*CppScheduling.can_fuse_vertical_outer_loop  s    !!## E%%''E))+eoo=E ..ue< -**,,E 11%?1D		
rV   c                 *    | j                  ||      ryyr'  )r2  rL  s      rW   get_fusion_pair_priorityz&CppScheduling.get_fusion_pair_priority(  s    ,,UE:rV   c                     |j                         ry|j                         r%t        ||g      \  }}|j                          xr |S | j                  ||      xr |j                          xs | j	                  ||      S r  )r/  rM   r  rM  r2  )r.  r  r  template_fusion_supportedr   s        rW   can_fuse_verticalzCppScheduling.can_fuse_vertical/  s    +Sw,(%q ))++I0II**5%8UASASAU=U=..ue<	=rV   r  c                    t        d |D              r|S ddd}d}d}d}|D ]z  }t        |j                  t        j                        sJ |j                  j                         \  }}}|j                  j                         D ]  \  }	|	j                  t              D ]  t        fd|j                  D              r|k7  r}|dz  }|dkD  r|c c c S t        j                  d   t        j                  j                  j                        svj                  d   |j                  v st!        fd|j                  j                         D              sÉj                  d   dkD  s։j                  d   j                  d   d	}|}  } |s|S dfd
}
|D ]  }||k(  s	|j#                  |
        |D ]  }||k7  s	|j#                  |
        |S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c              3      K   | ]X  }t        |j                  d    d          dk7  xs4 t        d |j                  j                  j                         D               Z yw)r5   r   c              3   F   K   | ]  }|j                  t                y wr  )r   r   )r  r  s     rW   r  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>Q  s      .2)r  N)r   r  rf   r  r'  r  r  s     rW   r  z/CppScheduling.try_loop_split.<locals>.<genexpr>O  sc      

 	 

1a !Q&  6:jj6O6O6V6V6X 
s   AA Nr   Fc              3   @   K   | ]  }j                  |        y wr  )r   )r  r   div_exprs     rW   r  z/CppScheduling.try_loop_split.<locals>.<genexpr>e  s     Q#HLL-Qr  r5   c              3   d   K   | ]'  \  }}|k7  rt        |j                  d          dv  ) yw)r   r)  N)r  r  )r  name_expr_rm  r   s      rW   r  z/CppScheduling.try_loop_split.<locals>.<genexpr>p  s9        ,u$} 0x}}Q7GHFR r  rX  Tc                    | \  }}|\  }}|j                        }|j                         }||   z  ||<   |j                  |dz          t        j                  ||d      \  \  }	}
}|	j                         }|j                  |dz         }||   z  |z   ||<   t        j                  |||g||	|      }s/|j                  t        |j                  j                               f||f||	|ffS )Nr5   r  )r  )r   copyinsertr
   index_vars_no_squeezepopr   r   r  r#  r'  r  )r  r@  r  
index_sizereduce_sizer  reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_varr-  split_number	split_vars                 rW   
loop_splitz0CppScheduling.try_loop_split.<locals>.loop_split  s   &+#J&*#J"((3I'__.N(29(=(MN9%!!)a->.:.P.PC/+^Q '++-I#--	A6K#/)I2F#F#TIi ;;y+.
NKD .OO,,3356.*
  -- rV   )recompute_sizes_body_func)r-  r  )rf   r   r  r   r%  r&  r'  r  findr   r|  r  r   corenumbersrN  r"  r1  )r.  r  num_div	div_expr_	match_divmatched_noder  r   original_bodyr  r  rm  r-  r   r~  r  s              @@@@@rW   try_loop_splitzCppScheduling.try_loop_split<  s
   &  

 
 
 L			 	,Ddii):):;;;"&))"B"B"DA}a+::@@B ,
d $		( 3 ,HQ9P9PQQ$	1$,	1{$"8==#3UZZ5G5G5O5OP$MM!,0G0GG ,  0=0L0L0R0R0T  
 %MM!,q0$,MM!$4	'/}}Q'7$(	'+/,,	,< L%)"	:  	SD|#,,z,R	S  	D|#,,/I.8 - 	 rV   r  c                 H  	 | j                   t        j                  }g g 	t        |t              sJ dt        f	fd} ||      s|t        _        j                          	j                          t        j                  j                  j                  d      5  |j                         D ]X  }t        |t        t        f      sJ |j                         }t              }|j                  |       j!                  ||       Z 	 ddd       yy# 1 sw Y   yxY w)a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r  c           	      v    t         t              sJ j                          j                          dt        fdg }i t	         fd j                         D              rt        t                   j                         D ]  t        t              sJ j                  j                                j                         st        j                               dk7  rbj                         d   t	         fdj                  D              sj                   }t        |t"        j$                        sJ |j'                         } j(                  t                     z
  }fd}|j+                         r |       s	t#        j,                  |j.                  |j0                  |j2                  |d |j4                  |d       }fd	}d
} |||      }	|	sEt#        j6                  | dt        |       |      }	|j9                  |	       g |	j:                  <   |	j:                     j9                  |        t=        j>                        5 }
t        |      dkD  r4|D ]/  }|j:                  J |
jA                  ||j:                            1  j                         D ]t  }t        |tB        t        f      sJ tE              }|jG                  |j                                j9                  |       j9                  |j                                v  jI                   j(                        sD|
jJ                  D ]+  }tL        jN                  jJ                  jQ                  |       - 	 ddd       ytR        jT                  j9                  tS        jV                  t              t        |
jX                                      j[                        }j]                  |g t^        j`                  jc                               ddd       y# 1 sw Y   yxY w)zN
            Codegen code with fused outer loop and local Buffer.
            r  c                     t        | t        t        f      sJ | j                         }t	        |d       j
                  \  }\  }}t        |      t        |      z   }|S )Nc                 4    t        | j                               S r  )rm   r  r  s    rW   r  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>  s    Q^^-=)> rV   r  )r   r$   r"   r+  r_   r  r   )r  r  r   r  r  r|  s         rW   get_call_rangeszlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  s`    !$8J(KLLL-1^^-=.1>/% ,+E? $ElU?-CC""rV   c              3   `   K   | ]%  }t         |            j                  d z   k(   ' yw)r5   N)r   r&  )r  r0  r  r  s     rW   r  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s3       OE*+t/K/Ka/OOs   +.r5   r   c              3   V   K   | ]   }|j                   j                         v  " y wr  )r  r+  )r  r{  r  s     rW   r  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s&      :>		T^^%55   &)c                  B   dd} t        j                  j                  j                               D ]  \  }}| |z  z  | |z  }  j                  j	                  j                               }fd |      xr t        fdj                  D              S )Nr   r5   c                     | k(  S r  rU   )r  contiguous_index_exprs    rW   is_contiguous_indexzCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArV   c              3      K   | ]Y  }t        |j                  t              xr9  |j                  j                  j	                  j                                      [ y wr  )r   r  r$   r  get_read_exprr  )r  r{  r  scheduler_buffers     rW   r  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s\      Q %) !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!"Qs   AA")r  r  r  r  get_write_exprr  r"  r~  )rN  r   rG  write_index_exprr  r  r  rg  s       @@rW   is_all_write_read_contiguouszyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s    451%&F.6 . 4 4 ? ? E E G/ 0
U !6# E 5 &%	0
 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rV   Nc                 ~    |D ]7  }| |j                   k(  st        fd|j                     D              s5|c S  y )Nc              3      K   | ]]  }|j                   Ot        fdt        j                  j                  j
                  |j                      j                  D               _ y w)Nc              3   V   K   | ]   }|j                   j                         v  " y wr  )r  r  )r  r{  visited_scheduler_nodess     rW   r  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>  s,      (&,0 )-		(:(:(<@W(W(&r  )r   r"  r4   r  r!  name_to_bufr~  )r  global_bufferr  s     rW   r  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>  s`      S" )6 (5'9'9'E %( (&45GG4E4E4Q4Q,9,>,>5**/%	(& %&S"s   A#A&)rS  r"  r   )local_buffer_layoutr  	local_buflocal_to_global_buffersr  s      rW   try_share_local_bufferzsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer  sS    -: 5	#6):J:J#Js S" :Q(1:&S" P" ,5$45 $(rV   local_buffer_datar   )r   rS  F)local_buffer_numberT)2r   r  clearr   r"  r$  r   ro   r+  r$   r  r  r  r   get_outputsr~  r  r   r%  r  r&  is_contiguousFixedLayoutr  r   r:  rN  Bufferr  r   rJ   r  add_local_bufferr"   r   r  rS  r  r4   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountr  r^  finalize_kernelrJ  rK  from_iterable)r  r  r  global_buffer_layoutsize_offsetr  r  r  local_buf_prefixlocal_buffer_usedscoper  r0  rQ  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  rg  r  rO  rV  
nodes_lists   `               @@@@@rW   $try_outer_loop_fusion_with_local_bufzSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  s
    d$?@@@!'')#&7 # .0MBD# !113  +5S/*;'&*nn&6 [N%nmDDD+//0G0G0IJ&335~99;<A '5'A'A'CA'F$ BRBXBX  )9(=(=)-9J9JKKK/</G/G/I,&*&B&BS+N;F '4 1>>@ < >$.0nn077066055klC077E	/+(" ,?(,B/-)  102		(8'93};M:N%O':1- *001BCNP34E4J4JK/0A0F0FGNN)s[z $L$5$56 "%}%)(5 +00<<<..(*A,BSBS*T
 "113 9E%e.@--PQQQ'5l'C$$225??3DE)001AB%%eoo&789 >>)4+G+G +0*?*? G //66~FG !+" ", 99@@2212,/0C0C,D 150O0O)1- ,,1@ioo33J?@?"H I"H s   *D!P/BP//P8Fr  N)rV  r   r  r   r  r  r   r  r   r  r$  r"   r$   r+  r   r  r  )
r.  r  r  r  r0  _nodesrQ  rO  rV  r  s
          @@@rW   codegen_outer_loop_nodez%CppScheduling.codegen_outer_loop_node  s    (()0)O)O&6802
$ ;<<<_	7R _	B 4D95SG2!'') ''--e-D K!113 KE%e.@--PQQQ27//2CF'5l'C$$226: 001A6JKK K :K Ks   !A,DD!c                 h   | j                   }t        |t              r| j                  |       nO|j	                         }| j                  |      }t        |      }|j                  |       |j                  ||       | j                         }|t        j                  kD  r| j                  d       yy)zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)rV  r   r  r  r+  r  r   r  r  _get_scheduled_num_argsr  MAX_FUSED_KERNEL_ARGS_NUMr  )r.  r  rV  r  rQ  args_nums         rW   codegen_nodezCppScheduling.codegen_nodel  s     ((d78((.)-)9E''.E-l;**51(()95A//1m===""4( >rV   c                 n    t        |t              xr$ t        |j                  t        j                        S r  )r   r$   r  r   CppTemplateBuffer)r.  r  s     rW   is_cpp_templatezCppScheduling.is_cpp_template  s,    $. 
:IIr++4
 	
rV   template_nodeepilogue_nodesprologue_nodesc                    |rJ |D cg c]  }t        |t        t        f      r| }}t        d   dxx   dz  cc<   t        d   dxx   t	        |      z  cc<   | j                  |      sJ d       t        t        |      }|j                  \  }\  }}|dk(  sJ t        t        j                  |j                        }|D cg c]  }|j                   }	}t        d |	D              sJ d       d	 }
 |
||j                  |	      }|j                  |||	
      \  }}|5  t        |j                        s|j                          |D ]  }|j                            |       }ddd       t!        j"                  |      5  |g|}| j%                  ||j&                        }ddd       t        |j                        rt	        |j(                        dk(  sJ d       |j(                  d   j*                  D ]r  }t        |j                  t,              sJ d       t        |j                  j                  t        j.                        sJ d       |j                  j                          t |j1                  |       t         j2                  xj4                  |j4                  z  c_        | j7                          yc c}w c c}w # 1 sw Y   fxY w# 1 sw Y   3xY w)zG
        Codegen a CPP template, possibly with fused epilogues
        inductorcpp_templated_kernel_counterr5   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrU   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   r%  )r  r  s     rW   r  z1CppScheduling.codegen_template.<locals>.<genexpr>  s     O:a!2!23OrF  z9Epilogue nodes must all be instances of ir.ComputedBufferc                     sy| j                         |v sJ || j                            j                  }t        fd|D               S )NFc              3      K   | ]8  }t        |j                  t              xr |j                  j                  v  : y wr  )r   r  r   )r  r{  r  s     rW   r  zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  sA        499&78 5IINNn45s   >A)r  r~  r"  )template_bufferoutputs_by_namer  r~  s     ` rW   template_buffer_has_other_userszGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  s\     ""++-@@@#O$<$<$>?EEE  "   rV   )$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r$   r"   r   r   r  r   r  r   r  r  r"  r  make_kernel_renderr)   r  r4   set_kernel_handlerdefine_kernelr  outputsr~  r    rU  call_kernelr  r  free_buffers_in_scheduler)r.  r  r  r  epilogue_noder   rnumelctbr  epilogue_ir_nodesr  r  rZ  renderr  src_codenode_schedulekernel_namer{  s                      rW   codegen_templatezCppScheduling.codegen_template  s    "!!
 "0
--9K)LM 
 
 	;<A<:;s>?RR;##M2 	
z	
2 ]M:&,,;Av||$()=)=}?Q?Q$R*;
AFF;
 ;
 O=NOO 	
G	
O	 0O..0A0
, //1U, 0 

  	 ,]-?-?@&&(&   xH	  !!&) 	S*<^<M,,X}fkkRK	S %]%7%78 },,-2 e2 &--a066 %!$))-FG UG "$)).."..A MA 		""$% 	;,	6#9#99&&(S
 ;
:	  	 	S 	Ss$   K K"AK
#K
KK!c                 6    | j                   j                         S r  )rV  get_num_argsr3  s    rW   r  z%CppScheduling._get_scheduled_num_args  s      --//rV   c                     | j                   S r  r  r3  s    rW   ready_to_flushzCppScheduling.ready_to_flush  s    ###rV   c                      y r  rU   r3  s    rW   codegen_synczCppScheduling.codegen_sync  s    rV   c                    t         j                  j                  }t        j                  j
                  r$t        |t        j                  j
                        nd}dj                  d||j                         g      }t         j                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                        |      }|j                  dd      }|j                  d      }|j!                  d|      }	|||	d	z     d
}
t#               }|| j$                  j&                  n|}|j)                         \  }}}t         j                  j                  s|j+                  d|d       |j-                  |d       t         j                  j                  s|j+                  d       |j/                  ||j1                         d|
       |S )NrR   r   r  rZ  z#pragma CMTz//z
extern "C"r   r5   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r4   r  wrapper_coder   r  descriptive_namesr'   rD  next_kernel_suffixcpp_wrapperr   ro   r,   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  r=   rV  r  cpp_argdefsr   r  r  getvalue)r.  r  r  kernel_argsr  
fused_namer  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rW   r  zCppScheduling.define_kernel  s   ''&& zz++ "%)E)EF 	
 hhz73M3M3OPQ*+''*=*=;8##C(?(?$@BRS##C(D(D$E{S ##M48 ^^L1
MM#z2	'
Y]CDCH(*)4)<t  %%+**,1iww""%%(DYMQV&WXxt4ww""%%f-$$&,	 	 	
 rV   c                 2   | j                   j                         }|rZ| j                  || j                   j                        }| j                   j	                  t
        j                  j                  |       | j                          | j                  d       y r  )
rV  codegen_groupr  scheduled_nodesr  r4   r  r  r  r  )r.  r  r  s      rW   flushzCppScheduling.flush  sv    $$224,,$++;;K ))!''*>*>L!u%rV   r  )0r  r  r  r  r   r6   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTr  r_  r   r  r  r-  rk   r  r  r  r'  r   r  r0  r=  rM  rP  r   rX  r3  r2  re  rh  r#  r$   r  r  r  r	   r"   r  r  r   r  r  r  r  r  r  r`  ra  s   @rW   r  r    sp    !$!**33	
 $%,, $:n;U $ $%& &N*P=dx8H/I 6p>	<
&
/@
	
4)l

=mD$7 m^K)KB)/1C]RS),
$5 
$ 

U)(U) !!23U) !!23	U)n0$#J&rV   r  c                   D     e Zd Z fdZd Zd Zd ZddefdZd Z	 xZ
S )	r   c                    t         |           t               | _        t	               | _        t        | j
                        | _        t        j                         | _
        | j                  j                  | j                         g | _        y r  )r,  r-  r?   r  r7   
loops_codeWorkSharingr`  r   r   r   r   r  r  s    rW   r-  zKernelGroup.__init__  s^    L	&.doo.))+


  )!rV   c                 :     || j                   t               g| S r  )r  r+   )r.  r%  r  s      rW   r  zKernelGroup.new_kernel!  s    49924<t<<rV   c                     | xj                   |z  c_         | j                  }| j                  }|j                  ||       y r  )r  r  r`  r(  )r.  r  r  r   r`  s        rW   r  zKernelGroup.finalize_kernel$  s5    %WW  r*rV   c                 X    | j                   j                         \  }}}t        |      }|S r  )r  r  r   )r.  arg_defs
_call_args
_arg_typesr  s        rW   r  zKernelGroup.get_num_args*  s)    +/99+@+@+B(*jx=rV   r   c           	      .   | j                   j                          | j                  syt               }t        j
                  j                  xr t        j                  dv }|r|j                  dg       |j                  t        j                                |t        t        j                        n|}|t        t        j                         n|}| j"                  j%                         \  }}}dj'                  d      j)                  |      }t+               }|j                  d| d| d| d	       |j-                         5  |rHt.        j0                  j2                  }	|	d
t        |	      z   dz   nd}
|j                  d|
|z    dg       | j"                  j5                         D ]  \  }}|j                  d| d| d        |j7                  | j8                         d d d        |j;                         S # 1 sw Y   |j;                         S xY w)NrR   )linuxrP   z!#include <ATen/record_function.h>z,
   zextern "C" z void r  r   graph_r   zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r   r   r   )r   r	  r  r7   r   r  enable_kernel_profilesysplatformr   r   r   
cpp_prefixro   r,   r  r  r  r  ljustrD  rX   r   r4   r  graph_idaliasesr  r  r  )r.  r   r   r  r  r  r  r   func_export_declr  r  oldnews                rW   r   zKernelGroup.codegen_group/  s   

##~ !'

 @ @ !
S\\ V
 F
 !OO@ABy++-. <@<3{667T;?<c+667T..0!Q;;r?''113*+62B1C1XJaP	

 [[] 	)$77++;C;OCM1C7UW+F[,@+AAfg
 !II--/ 7Sse3se1567KK(	) }}	) }}s   B G<<Hc                 j    | j                   j                         \  }}}|j                  ||d|       y )NF)tritonr  )r  r  generate_kernel_call)r.  r  r  r   	call_argsr  s         rW   r  zKernelGroup.call_kernelW  s7    "&))"7"7"99i$$5I 	% 	
rV   r  )r  r  r  r-  r  r  r  ro   r   r  r`  ra  s   @rW   r   r     s)    "=+
&# &P
rV   r   c                   0    e Zd Zd Zd Zd Zd Zd Zd Zy)r  c                 `    || _         d| _        d | _        t        j                         | _        y r  )r   in_parallelr  r   r   r   )r.  r   s     rW   r-  zWorkSharing.__init___  s)    	 ))+
rV   c                    | j                   r|| j                  k7  r| j                          | j                   s|| _        d| _         t        j                  j
                  r| j                  j                  d       n| j                  j                  d| d       | j                  j                  | j                  j                                | j                  j                  d       y y )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r#  r  r	  r   r  r  r   r   r   r   r   )r.  r  s     rW   r  zWorkSharing.parallele  s    4+;+; ;JJL&D#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1  rV   c                 h    | j                   r| j                  j                  d       | j                   S )Nz#pragma omp single)r#  r   r   r3  s    rW   r  zWorkSharing.singleu  s*    II 45rV   c                 F    | j                   j                          d| _        y r  )r   r	  r#  r3  s    rW   r	  zWorkSharing.closez  s    

 rV   c                 :    | j                   j                          | S r  )r   rm  r3  s    rW   rm  zWorkSharing.__enter__~  s    

rV   c                 >    | j                   j                  |||       y r  )r   rs  ro  s       rW   rs  zWorkSharing.__exit__  s    

Hgv6rV   N)	r  r  r  r-  r  r  r	  rm  rs  rU   rV   rW   r  r  ^  s     ,  
!7rV   r  c                      e Zd ZU dZeej                     ed<   dZeej                     ed<   ej                  j                  Zej                  ed<   ej                  j                  Zej                  ed<   ej                  j                  Zej                  ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   d	Zeed<   d Zd Zd Zy)r  Nr   r:  r;  r  r<  r   r  Fsimd_ompsimd_vec	collapsedr  c                 j    t        j                         }|r|j                         | _        y d| _        y rU  )r   rN  r-  simd_nelements)r.  ra  s     rW   __post_init__zLoopLevel.__post_init__  s-     .9-E-E-GAO>#;#;#=UVrV   c                    t        j                  |      }t        | j                  | j                        }||_        d|_        t        |j                  |      |z  |_        | j                  |_	        d|_
        | j                  |_        |S )NTF)r   rN  r  r   r:  r<  r+  r   r  r  r,  r  )r.  r  sympy_factorr  s       rW   r  zLoopLevel.tile  sn    }}V,499-!
"499l;lJ --rV   c                    t        | j                        }t        | j                        }t        j                  j
                  r||k(  ry | j                  r| j                  dkD  rd| j                   dnd}| j                  rFd}| j                  dkD  r|d| j                   dz  }| j                  r\|j                  dd|       }nF| j                  rd}n7| j                  rd	| }n%| j                  st        j                         rd
}nd}t         d| j                   d| }| j                   d| }| j                   j"                  r%| j                   dt        | j                          }n;| j                   dt        | j                          dt        | j                          d}d| d| d| d}| j$                  s|s|gS ||gS )Nr5   zsimd simdlen(z) rR   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r~  <r}  z+=(z == 0 ? 1 : zfor(r   )rE   r;  r:  r   r  no_redundant_loopsr*  r.  r  r   r+  r  r   r  rI   r   r<  r  r,  )	r.  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rW   r  zLoopLevel.lines  s   !$++.		*	::(([I-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7Nu~rV   )r  r  r  r   r   r   r  r  r:  r   r   r;  r  Oner<  r  rm   r*  rk   r+  r,  r  r/  r  r  rU   rV   rW   r  r    s     $C%**	$!%D(5::
%FEJJ% #WW\\J

)E5::#HcHdHdItL$
W	'rV   r  c                       e Zd ZU dZdZeee      ed<   dZ	ee
   ed<   ede
fd       Zd Zed        Zd Zd	 Zd
e
fdZd ZdefdZy)rF  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    NrB  rZ  c                 $   | j                   }| j                  }| j                  }|J d}t        t	        ||            D ]B  \  }\  }}t        ||      }|s|g}n|j                  |       ||k\  s2| j                  |_        D t        |      }	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	rH  rM  r}  r   r  r  r  r  rF  )
rZ  rH  rM  r}  rB  loop_idxr   r:  r  rH  s
             rW   r'  zLoopNest.build  s     ?? 00***+/%.s8V/D%E 	8!HksDS$'DT"?*$*$7$7!	8 UO	rV   c                 ,    t        | j                        S r  )rk   rB  r3  s    rW   __bool__zLoopNest.__bool__  s    DJJrV   c                    | j                   t        dd      S d}d}| j                   d   j                  }t        j                  d      }| j                   D ]'  }|j                  |k7  r n||j
                  z  }|dz  }) |t        | j                         k  rt        |t        j                        rt        | j                   |   j
                  t        j                        r|dz  | j                   |   j
                  k  rd|}d}| j                   |   j                  }t        |t        | j                               D ]%  }| j                   |   j                  |k7  r n|dz  }' t        ||      S )a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        r   r/  r5   rE  )	rB  r	  r  r   rN  r:  r   r   rG  )r.  r  	max_depthr  
loop_sizesr  r   s          rW   r  zLoopNest.max_parallel_depth  sG    :: qAA	zz!}11]]1%
JJ 	D  L0#dii/JNI		 DJJ':u}}54::i055u}}ES 4::i#8#=#==#KI::k2??L;DJJ8 ::a=--=Q	 I;OOrV   c                    |j                   | j                         j                   k  sJ d       | j                  J t        | j                        |j                   k\  sJ | j                  |j                     }|j                   |_        |j                  rt        xj                  dz  c_        t        |j                  dz   |j                         D ]  }d| j                  |   _
         y )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr5   T)r
  r  rB  r   r  r  r  r   parallel_reduction_countrG  r,  )r.  r  r  r   s       rW   r  zLoopNest.mark_parallel)  s    ''4+B+B+D+S+SS 	
M	
S zz%%%4::)":"::::zz)//0!00,,1,y,,q0)2J2JK 	+A&*DJJqM#	+rV   c                     | j                   sJ | j                   |   j                  |      | j                   |<   | j                   |   S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )rB  r  )r.  r  r  s      rW   r  zLoopNest.tile6  sA     zzz JJu-226:

5zz%  rV   r   c                 6    | j                   sJ | j                   S r  rZ  r3  s    rW   r  zLoopNest.get_kernelB  s    {{{{{rV   c                     || _         y r  rI  r  s     rW   r  zLoopNest.set_kernelF  s	    rV   levelc                     | j                   sJ t        | j                         |k\  sJ |t        | j                         k(  rd n| j                   |d  }t        || j                        S r  )rB  r   rF  rZ  )r.  rK  rB  s      rW   rX  zLoopNest.from_loop_levelI  sV    zzz4::%'''TZZ0djj6Ht{{++rV   )r  r  r  r  rB  r   r#  r  r  rZ  ru  r  r'  rA  r%   r  r  r  r  r  rm   rX  rU   rV   rW   rF  rF    s     (,E8DO$+"&FHY&i  (  $P $PL+
!I ,S ,rV   rF  rI  r  )r   dataclassesrI  rJ  rY  rK  r   r  r  collections.abcr   enumr   typingr   r   r   r   r	   r   r   torch.fxtorch._inductorr
   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rR   r   r   r   r   r   r   r  r   r!  r   r   r    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   commonr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   	cpp_utilsrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   r  rT   	lru_cacherX   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPr2  r3  r   r1  rn   rk   r4  r5  r
  r  rv   r#  r   r  rw   r   r   r   r   r   r  ro   r   rm   r   r   r   r  r  	dataclassr	  r  rc  ry  _initialize_pointwise_overridesr  rm  rp  ru  r  rr  r   r  r  r   rW  r  r  r   r  r  rF  rU   rV   rW   <module>rf     s/         	 
  $  7 7    ( @ / K K O O % G G        > =      $ llg% T: : ~~//*EBC   !   #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK
* T%++& 
 
KK	NN	MM	KK	JJ1 D- )D %)+) ELL!	+)\-
-jj- - 

	-
 jj- -`3#$  ;;	
 
sCxBV^ V3 V# V -UZZ -ell - - ;uzz ;

 ;PS ; ;| FJ!::!!LL!6>sm! !   ]!"4 ]!@! !Bk; k\  , ,U 3r7l r7j  / / 9  % % '7 7w wtk9 k\Y
l Y
x)$ )$eHU[[4I44O.P )$XbK bKJT	
Y T	
n 
9  
FFt FD&N D&ND
 D
N%7 %7P R R Rj p, p, p,rV   