
    Vhi                       d dl mZ d dlZd dlZd dlmZmZmZ d dlZd dl	m
Z
 d dlZd dlmZ d dlmZ ddlmZmZ dd	lmZmZmZ d
dlmZmZmZmZmZmZ d
dlm Z m!Z!m"Z" erd dlm#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) d
dlm*Z* ejV                  dejX                  dejZ                  dej\                  dej^                  dej`                  dejb                  dejd                  dejf                  di	Z4d%dZ5 G d de      Z6 G d de      Z7e7jq                  d        G d  d!e!      Z9ejt                  d&d"       Z; G d# d$e"      Z<y)'    )annotationsN)AnyOptionalTYPE_CHECKING)
PRECEDENCE)ExprPrinter)ValueRanges   )get_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    t        | t              r:| t        j                  k(  ry| t        j                   k(  ry| | k7  ryt	        |       S t        | t
              r| rdS dS t	        |       S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer&   torchinfstrr    )vals    K/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr4   2   s_    #u%))UYYJCZ3x	C	v)')s8O    c                  L    e Zd Zd
dZd
dZd
dZd
dZd
dZd
dZd
dZ	d
dZ
y	)MetalExprPrinterc                    |j                   \  }}| j                  |      }| j                  |      }|j                  r	d| d| dS d| d| dS )N() / ()metal::floor(argsdoprint
is_integer)selfexprxdivs       r3   _print_FloorDivz MetalExprPrinter._print_FloorDivA   s[    3LLOll3??qcse1%%qcse1--r5   c                    |j                   \  }}}| j                  |      }|dk7  r0| j                  |      }|j                  r
d| d| d}n	d| d| d}| j                  |      }d| d| dS )Nr   r9   r:   r;   r<   z) % (r=   )rA   rB   rC   rD   mods        r3   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingI   s    ii3LLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r5   c                    t        |j                        dk7  rt        d      ddj                  t	        | j
                  |j                               dS )Nr
   z$metal::min only supported for 2 argszmetal::min(, r;   lenr>   RuntimeErrorjoinmap_printrA   rB   s     r3   
_print_MinzMetalExprPrinter._print_MinU   F    tyy>QEFFTYYs4;;		'BCDAFFr5   c                    t        |j                        dk7  rt        d      ddj                  t	        | j
                  |j                               dS )Nr
   z$metal::max only supported for 2 argszmetal::max(rJ   r;   rK   rQ   s     r3   
_print_MaxzMetalExprPrinter._print_MaxZ   rS   r5   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   metal::abs(r   r;   rL   r>   rP   rQ   s     r3   
_print_AbszMetalExprPrinter._print_Abs_   s9    499~"""T[[167q99r5   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   zstatic_cast<long>(metal::rint(r   z))rX   rQ   s     r3   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntc   s9    499~"""/DIIaL0I/J"MMr5   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  d	S )
Nr
   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er;   )rL   r>   r@   
ValueErrorparenthesizer   )rA   rB   numberndigits
number_strs        r3   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimalg   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r5   c                n    |j                   \  }}d| j                  |       d| j                  |       dS )Nstatic_cast<float>(z) / static_cast<float>(r;   )r>   rP   )rA   rB   lhsrhss       r3   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDivs   s;    99S$T[[%5$66MdkkZ]N^M__`aar5   N)rB   
sympy.Exprreturnr1   )__name__
__module____qualname__rE   rH   rR   rU   rY   r[   re   rj    r5   r3   r7   r7   @   s/    .
"G
G
:N
]br5   r7   c                     e Zd Ze	 	 d5	 	 	 	 	 	 	 	 	 d6d       Ze	 	 	 	 	 	 	 	 d7d       Zed8d       Zed9d       Zed:d       Zed;d       Z	ed<d       Z
ed=d	       Zed=d
       Zed=d       Zed=d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed>d       Zed?d       Zed>d       Zed>d       Zed>d       Z ed>d       Z!ed>d        Z"ed>d!       Z#ed>d"       Z$ed>d#       Z%ed>d$       Z&ed=d%       Z'ed>d&       Z(ed>d'       Z)ed=d(       Z*ed>d)       Z+ed=d*       Z,ed>d+       Z-ed@d,       Z.ed@d-       Z/e	 	 	 	 	 	 	 	 	 	 dAd.       Z0ed>d/       Z1ed=d0       Z2ed=d1       Z3ed>d2       Z4ed>d3       Z5ed>d4       Z6y)BMetalOverridesNc                "    dt         |    d|  dS )Nzstatic_cast<>(r;   DTYPE_TO_METAL)rC   dtype	src_dtypeuse_compute_typess       r3   to_dtypezMetalOverrides.to_dtypez   s     nU34Bqc;;r5   c                "    dt         |    d|  dS )Nz*reinterpret_cast<thread z*>(&r;   ru   )rC   rw   rx   s      r3   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s     +>%+@*AaSJJr5   c                    t        |       S Nr4   )r2   rw   s     r3   constantzMetalOverrides.constant   s    c""r5   c                @   t         j                  j                  t         j                  j                  |             }t         j                  j                  j                  t         j                  j                  |t        |             }t        j                  ||      S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   rz   )rB   rw   idx_strvars       r3   
index_exprzMetalOverrides.index_expr   sl    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r5   c                    t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t        j                  ||      S # 1 sw Y   AxY wr~   )r   r   
mask_loadsr   is_boolr    r   where)maskbodyothernew_maskresults        r3   maskedzMetalOverrides.masked   sa     XX  u- 	VF	 ==  KEyy6511	 	s   A))A2c                (    |  d| dt        |       S )Nz ? z : r   )abcs      r3   r   zMetalOverrides.where   s    Cs#nQ/011r5   c                d   t        |t              r)|j                  |j                  j                  s|  d| S t        | t              r#| j                  t        j
                  k7  rd|  dn| }t        |t              r#|j                  t        j
                  k7  rd| dn|}| d| d| d| dS )N % rg   r;   z - z * metal::floor( / )r.   r   rw   is_floating_pointr/   r&   r   r   float_afloat_bs       r3   	remainderzMetalOverrides.remainder   s     q+&#GG--SA3< ![)agg.D "!A& 	 ![)agg.D "!A& 	
 #gY&6wis7)1MMr5   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )Nstatic_cast<decltype(+)>(r;   zc10::metal::max(rJ   rp   r   r   
typecast_a
typecast_bs       r3   maximumzMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r5   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )Nr   r   r   r;   zc10::metal::min(rJ   rp   r   s       r3   minimumzMetalOverrides.minimum   r   r5   c                    |  d| S )Nz || rp   r   r   s     r3   
logical_orzMetalOverrides.logical_or       D}r5   c                    |  d| S )Nz && rp   r   s     r3   logical_andzMetalOverrides.logical_and   r   r5   c                    d|  dS )Nzmetal::isnan(r;   rp   rC   s    r3   isnanzMetalOverrides.isnan       qc##r5   c                    d|  dS )Nzmetal::isinf(r;   rp   r   s    r3   isinfzMetalOverrides.isinf   r   r5   c                    d|  dS )Nzmetal::log(r;   rp   r   s    r3   logzMetalOverrides.log       QCq!!r5   c                    d|  dS )Nzmetal::exp(r;   rp   r   s    r3   expzMetalOverrides.exp   r   r5   c                    d|  dS )NrW   r;   rp   r   s    r3   abszMetalOverrides.abs   r   r5   c                    d|  dS )Nzmetal::signbit(r;   rp   r   s    r3   signbitzMetalOverrides.signbit        1%%r5   c                    d|  dS )Nzmetal::precise::sin(r;   rp   r   s    r3   sinzMetalOverrides.sin       %aS**r5   c                    d|  dS )Nzc10::metal::sinc(r;   rp   r   s    r3   sinczMetalOverrides.sinc       "1#Q''r5   c                    d|  dS )Nzmetal::precise::cos(r;   rp   r   s    r3   coszMetalOverrides.cos   r   r5   c                    d|  dS )Nzc10::metal::i0(r;   rp   r   s    r3   i0zMetalOverrides.i0   r   r5   c                    d|  dS )Nzc10::metal::i1(r;   rp   r   s    r3   i1zMetalOverrides.i1   r   r5   c                    d|  dS )Nzc10::metal::erf(r;   rp   r   s    r3   erfzMetalOverrides.erf   s    !!A&&r5   c                    d|  dS )Nzc10::metal::erfinv(r;   rp   r   s    r3   erfinvzMetalOverrides.erfinv   s    $QCq))r5   c                    d|  dS )Nzc10::metal::log_gamma(r;   rp   r   s    r3   lgammazMetalOverrides.lgamma  s    's!,,r5   c                    d|  d| dS )Nzc10::metal::polygamma(rJ   r;   rp   )rC   ys     r3   	polygammazMetalOverrides.polygamma  s    's"QCq11r5   c                    d|  dS )Nzc10::metal::digamma(r;   rp   r   s    r3   digammazMetalOverrides.digamma	  r   r5   c                    d|  dS )Nzmetal::tan(r;   rp   r   s    r3   tanzMetalOverrides.tan  r   r5   c                    d|  dS )Nzmetal::asin(r;   rp   r   s    r3   asinzMetalOverrides.asin      aS""r5   c                    d|  dS )Nzmetal::acos(r;   rp   r   s    r3   acoszMetalOverrides.acos  r   r5   c                    d|  dS )Nzmetal::atan(r;   rp   r   s    r3   atanzMetalOverrides.atan  r   r5   c                    d|  dS )Nzmetal::sqrt(r;   rp   r   s    r3   sqrtzMetalOverrides.sqrt  r   r5   c                    d|  dS )Nzmetal::rsqrt(r;   rp   r   s    r3   rsqrtzMetalOverrides.rsqrt!  r   r5   c                    d|  dS )Nzmetal::tanh(r;   rp   r   s    r3   tanhzMetalOverrides.tanh%  r   r5   c                    d|  dS )Nzmetal::atanh(r;   rp   r   s    r3   atanhzMetalOverrides.atanh)  r   r5   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr   r   z((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r;   rp   )r   r   quotrems       r3   floordivzMetalOverrides.floordiv-  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXr5   c                    d|  dS )Nr<   r;   rp   r   s    r3   floorzMetalOverrides.floor4  r   r5   c                    d|  dS )Nzmetal::sign(r;   rp   r   s    r3   signzMetalOverrides.sign8  r   r5   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )Nr   r   r   r;   zmetal::fmod(rJ   rp   r   s       r3   fmodzMetalOverrides.fmod<  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r5   c                    d|  dS )Nmetal::trunc(r;   rp   r   s    r3   trunczMetalOverrides.truncB  r   r5   c                    | j                   t        j                  k7  rd|  dn| }|j                   t        j                  k7  rd| dn|}d| d| dS )Nrg   r;   r   /)rw   r/   r&   r   s       r3   truncdivzMetalOverrides.truncdivF  sY     125;;0F's!,A015;;0F's!,Awiq	33r5   c                    d|  dS )Nzmetal::ceil(r;   rp   r   s    r3   ceilzMetalOverrides.ceilN  r   r5   c                    d|  d| dS )Nzc10::metal::rand(rJ   r;   rp   seedoffsets     r3   randzMetalOverrides.randR  s    "4&6(!44r5   c                    d|  d| dS )Nzc10::metal::randn(rJ   r;   rp   r   s     r3   randnzMetalOverrides.randnV  s    #D6F8155r5   c           	          d|  d| d| d| d	S )Nzc10::metal::randint64(rJ   r;   rp   )r   r   lowhighs       r3   	randint64zMetalOverrides.randint64Z  s%     (vRxr#baHHr5   c                    d|  dS )Nzmetal::round(r;   rp   r   s    r3   roundzMetalOverrides.round`  r   r5   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )Nr   r   r   r;   zmetal::pow(rJ   rp   )r   r   cast_acast_bs       r3   powzMetalOverrides.powd  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r5   c                    d|  d| dS )Nzc10::metal::zeta(rJ   r;   rp   r   s     r3   zetazMetalOverrides.zetaj  s    "1#Rs!,,r5   c                    d|  dS )Nz c10::metal::spherical_bessel_j0(r;   rp   r   s    r3   spherical_bessel_j0z"MetalOverrides.spherical_bessel_j0n  s    1!A66r5   c                    d|  dS )Nzc10::metal::xlog1py(r;   rp   r   s    r3   xlog1pyzMetalOverrides.xlog1pyr  r   r5   c                    d|  dS )Nzc10::metal::entr(r;   rp   r   s    r3   entrzMetalOverrides.entrv  r   r5   )NT)
rC   r   rw   torch.dtyperx   zOptional[torch.dtype]ry   r    rl   r1   )rC   r   rw   r  rx   r  rl   r1   )r2   zUnion[bool, float, int]rw   r  rl   r1   )rB   rk   rw   r  rl   r1   )r   r   r   rk   r   r   rl   r1   )r   r   r   r   r   r   rl   r1   )r   r   r   r   rl   r1   )r   r   r   r   rl   r1   )rC   r   rl   r1   )rC   r   r   r   rl   r1   )r   r   r   r   rl   r1   )
r   r   r   r   r  r   r  r   rl   r1   )7rm   rn   ro   staticmethodrz   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r	  r  r  r  r  r  rp   r5   r3   rr   rr   y   s    ,0"&	<<< )<  	<
 
< < KK*K7BK	K K
 # # ( ( 2 2 2 2 N N( > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + & & & & ' ' * * - - 2 2 + + " " # # # # # # # # $ $ # # $ $ Y Y $ $ # # : :
 $ $ 4 4 # # 5 5 6 6 II#.I5@IHSI	I I
 $ $ 1 1
 - - 7 7 + + ( (r5   rr   mpsc                  H    e Zd ZeZdZdZdZ e       j                  Z
 e       j                  ZeZ	 	 	 	 	 	 d fdZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZd ej&                         f	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 	 	 	 	 dd
ZddZddZdddZdddZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )MetalKernel;auto i   c                f    t        |   |fi | t        j                         | _        d| _        y )NF)super__init__	itertoolscountacc_var_idsmultistage_reduction)rA   tilingkwargs	__class__s      r3   r  zMetalKernel.__init__  s.    
 	*6*$??,$)!r5   c                    t         |   S r~   ru   )rA   rw   s     r3   dtype_to_strzMetalKernel.dtype_to_str  s    e$$r5   c                   | j                   j                  |      }| j                  |      }| d| j                  |       d}| j                  j                  | j                  |t        j                  j                  |            S )z"Codegen a load from an InputBuffer[]rw   )
r>   inputr   r   r   r   loadsr   graph	get_dtype)rA   nameindexr   lines        r3   loadzMetalKernel.load  sq    iiood#%%e,a))%013xx  T9J9J49P QQr5   Nc                   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }| d| j                  |       d| d| d}| j                  r&| j                  j                  t        ||             y | j                  j                  t        ||             y )Nr*  z] = static_cast<rt   );)r>   outputr   r(  r   r/  r0  r   inside_reductionr   	writeliner   stores)rA   r1  r2  valuemoder   	dtype_strr3  s           r3   storezMetalKernel.store  s     iit$%%e,%%agg&7&7&=>	a))%011A)BugUWX  LL""<d#;<KK!!,tT":;r5   c           	     J   dt        | j                         }t        j                  j	                  |||      }|r6| j
                  j                  d| j                  |       d| d| d       |S | j
                  j                  d| j                  |       d| d       |S )Ntmp_acc_zthreadgroup  r*  z];r  )nextr"  r   r   create_cse_varindexing_coder9  r(  )rA   rw   
elem_countr   var_namer   s         r3   _new_accvarzMetalKernel._new_accvar  s     d4#3#3456hh%%h>((t0078(1ZLPRS 
 ((t0078(1E 
r5   c                b	   t        d | j                  D              }t        |j                  | j                        }|dk(  r| j                  |      }| j                  j                  | d       | j                  j                  d       | j                  j                  d| d| d       | j                  j                  d       |S |dv r| j                  ||      }| j                  rl|d	k(  rd
nd\  }	}
| j                  j                  | d|j                   d|	 d       | j                  j                  | d|j                   d|
 d| d       n.| j                  j                  | d|j                   d| d       | j                  j                  | j                  d| d| d| dt        |         S |dv r$| j                  ||      }| d|j                   d}t         |   }| j                  sV| j                  j                  | d| d| d       | j                  j                  | j                  d| d| d| d|      S |j#                  d      rdnd}| j                  j                  | d| d| d       |j%                  d       rt        d! | j&                  j)                         D              }| j                  t*        j,                  |      }|d"k(  rd#nd$}| d|j                   d}| j                  j                  | d%       | j                  j                  d| d&| d&| d| d'| d(| d'|j                   d)       | j                  j                  | j                  | d*| d| d| d+|      S | j                  j                  | d,| d| d| d       | j                  j                  | j                  d| d| d| d|      S |d-k(  r| j                  r
J d.|        | j                  ||      }| j                  j                  | d|j                   d| d       | j                  j                  | j                  d| d| d| d      }t/        j0                  | d/| d0| j2                  j4                  f      S t7        |      )1zCodegen a reduction operationc              3  :   K   | ]  }|j                   s|  y wr~   is_reduction.0ts     r3   	<genexpr>z(MetalKernel.reduction.<locals>.<genexpr>  s     K1ANNQK   anyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            )prodsumrS  )r   r   )r   *r*  z] = r  z] z= zc10::metal::threadgroup_r9   rJ   r;   r,  )maxminargminargmaxr+  z = static_cast<rt   r6  rU  lowestz = ::metal::numeric_limits<z>::z();argc              3  :   K   | ]  }|j                   s|  y wr~   rJ  rL  s     r3   rO  z(MetalKernel.reduction.<locals>.<genexpr>  s      ArP  rX  ><z = -1;rA   = z;
                    z$;
                }
                z[c10::metal::threadgroup_z)]z = ::c10::metal::welford_reducez+Multistage reduction not yet supported for z.xz.y)rB  range_treesrV  numelmax_threadgroup_sizerG  rD  r9  r   splicer:  r#  r1  r   r   r   rv   endswith
startswithrange_tree_nodesvaluesr/   r$   r   _unwrapfeaturesreduction_numelNotImplementedError)rA   rw   rx   reduction_typer;  reduction_dimacc_buf_sizeaccacc_bufdefault_valreduction_opacc_thread_varsrc_metal_typelim_fnidx_varidx_acc_bufcmp_opidx_thread_varwf_ress                      r3   	reductionzMetalKernel.reduction  sm    K(8(8KK=..0I0IJU"""5)C((C5	):;((I LLG E  KK!!I J_,&&y,?G(( .% 7HX *\ "",,iq!3!3 4DQG ##iq!3!3 4B|nBugQO ##wiq1C1C0DDq$QR88$$*>*:!G9B|nTUV07 %  
 ??&&y,?G 'y-*<*<)=Q?N+I6N,,##%&on5ERwbQ xx((KK.~.>ay<.XYZ )  
 "0!8!8!?XUF((!""=n=MSQWPXX[\ ((/ #44;;=  #..uzz<H .( :$/=-2D2D1EQ!G""))^,<F*CD## )G1VHAn%5 6#$Cw /#$C~ 6%  xx((KK"m#<^<LAgYVXYeXffhi )  
 LL""!""3N3C1^DTTVW\V]]_` 88$$*>*:!G9B|nTUV %  
 --00 =n=MN0 &&y,?GLL7)1]-?-?,@UG1 MNXX&&*>*:!G9B|nTUVF %%826("t}}/L/LM  ".11r5   c                   | j                  |j                        }| j                  |      }|j                  r(|j                  j
                  | j                  kD  | _        |j                  r| j                  s9| j                  j                  | j                   d|j                   d| d       y |j                  j
                  | j                  z   dz
  | j                  z  }| j                  j                  d|j                   d|j                   d| d|j                   d		       | j                  j                         5  | j                  j                  | j                   d|j                   d| d
| d|j                   d
       || j                  z  |j                  j
                  k7  r@| j                  j                  d|j                   d|j                  j
                   d       d d d        y # 1 sw Y   y xY w)NrA  r^  r  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {r_   z + z_cnt;if ( >= z) break;)rename_indexingrB   sexprrK  rootra  rb  r#  rD  r9  index_dtyper1  r   indent)rA   entryr   	index_str	loop_sizes        r3   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry!  s   ))%**5
JJz*	(-

(8(84;T;T(TD%!!)B)B((##$Aejj\YKqA 
 JJt8881<&&'	 			

|:ejj\4PUPZPZ|[cd	
 YY 	WII##$Aejj\YKs9+SQVQ[Q[P\\ab 4444

8H8HH		##d5::,d5::;K;K:LH$UV	W 	W 	Ws   B/G==Hc                   | j                   r| j                  j                         5  | j                  j                  | j                         | j                  j                  | j
                         ddd       | j                  j                  d       d| _         nJ| j                  j                  | j                         | j                  j                  | j
                         | j                  j                  | j                         | j                  j                          | j
                  j                          | j                  j                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}F)	r#  r   r  rc  r.  r   r9  r:  clear)rA   s    r3   codegen_bodyzMetalKernel.codegen_body<  s     $$!!# /		  ,		  ./ II$(-D%IITZZ(IIT\\*		%

/ /s   AEE$c                   | j                          t               }|j                  d       | j                         }|j	                         5  |j                  dd       | j                  r|j                  d       |j                  d       |j	                         5  | j                  j                  j                         D ]Z  \  }}|| j                  v r| j                  t        j                  j                  |            }|j                  d| d| d	       \ | j                  j                  j                         D ]K  \  }}| j                  t        j                  j                  |            }|j                  d
| d| d	       M | j                  j                   j                         D ]  \  }}|j                  d| d	        t#        |      dk  sJ d       t#        |      dkD  rdt#        |       nd}t#        |      dk(  r|d   j$                  nd}| j                  rd	nd}	|j                  | d| d|	        | j                  r|j                  | d       ddd       |j                  d       |j	                         5  t#        |      dkD  rAt'        |      D ]3  \  }
}|j                  d|j$                   dt)        d|
z          d       5 |j                  | j*                         |j                  | j,                         ddd       |j                  d       ddd       |j                  d       |j/                         S # 1 sw Y   xY w# 1 sw Y   OxY w# 1 sw Y   BxY w)z3Called at the end to generate a final kernel stringzcompile_mps_shader("""z
            #include <c10/metal/random.h>
            #include <c10/metal/special_math.h>
            #include <c10/metal/utils.h>
            T)stripz&#include <c10/metal/reduction_utils.h>zkernel void generated_kernel(zdevice z* ,z	constant zconstant long&    z%Up to 3 index variables are supportedr   uintr   
thread_pos rA  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]Nz) {r  z = thread_pos.x   r  r  z"""))r  r   r9  active_range_treesr  rc  r8  r>   output_buffersitemsremoved_buffersr(  r   r/  r0  input_bufferssizevarsrL   r1  	enumeratechrrD  r   getvalue)rA   r1  codeidx_varsouterinnerr=  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s               r3   codegen_kernelzMetalKernel.codegen_kernelT  s@   /0**,[[] /	 KK
    $$GHNN:; $(II$<$<$B$B$D DLE5 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	D
 %)II$;$;$A$A$C FLE5 $ 1 1!''2C2CE2J KINNYykE7!#DEF %)II$6$6$<$<$> ?LE5NN_UG1#=>?8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZ/4 NN5! 'x=1$$-h$7 S#CHH:^Cc	N;K1M D../DII&' NN3_/	 ` 	v}}M 6' 'O/	  /	 s?   AM7GM.)M7BM+M7M(	$M7+M4	0M77N c           	     L   t         j                  j                  }g | j                  j                  j                         | j                  j                  j                         }|D cg c]  }|| j                  vs| }}|| j                  j                  j                         D cg c]  }t        |       c}z  }t        | j                               dkD  r| j                         D cg c]S  }| j                  |j                  r*t        j                  |j                   | j"                        n|j                         U }}|ddj%                  |       dgz  }| j&                  r{| j                         D cg c]I  }|j                  r9| j                  t        j                  |j                   | j"                              ndK }}|ddj%                  |       dgz  }|j)                  ||t+        j,                  d      d	       y
c c}w c c}w c c}w c c}w )zCodegen a call to this kernelr   z	threads=[rJ   r+  1zgroup_size=[cpuF)devicetritonN)r   r/  wrapper_coder>   r  keysr  r  r  r1   rL   r  pexprrK  sympyMinra  rb  rN   r8  generate_kernel_callr/   r  )rA   r1  nodewrapperr>   rZ  vthreadss           r3   call_kernelzMetalKernel.call_kernel  s   ''&&R))..0R4993J3J3O3O3QR#Gs$2F2F'FGG!3!3!8!8!:;AQ;; t&&()A- 002  

~~ IIaggt'@'@AG  y7!3 4A677D  
 002	  >> 

599QWWd.G.GHIG  |DIIg$6#7q9::D$$<<&	 	% 	
/ H;s    )H=H*H/AH AH!c                    |s|sy | j                  |      }|r| dnd}|r| d| j                  |       nd}|r|r
d| d| d}nd| | d}| j                  j                  | j                  |d	
       y )Nz < 0r  r~  zif ((z) && (z	)) returnr}  z) returnF)
assignment)r   r   r   r   )	rA   rB   sizelowerupperexpr_str
lower_expr
upper_exprr3  s	            r3   check_boundszMetalKernel.check_bounds  s      $$T**/z&R
CHzd&7&7&=%>?b
U:,fZL	BD*j\:D$,,?r5   )r$  zdict[str, sympy.Expr]r%  r   rl   None)rw   r  rl   r1   )r1  r1   r2  rk   rl   r   r~   )
r1  r1   r2  rk   r;  r   r<  r   rl   r  )rw   r  rE  zOptional[int]r   zValueRanges[Any]rl   r   )
rw   r  rx   r  rl  r   r;  +Union[CSEVariable, tuple[CSEVariable, ...]]rl   r  )r  r   rl   r  rl   r  )r1  zOptional[str]rl   r1   )r1  r1   r  r   rl   r  )
rB   rk   r  rk   r  r    r  r    rl   r  )rm   rn   ro   rr   	overridessuffixnewvar_prefixrb  r   r?   r  r7   r  kexprr  r(  r4  r>  r	   unknownrG  r{  r  r  r  r  r  __classcell__r&  s   @r3   r  r  ~  s\   IFMO##E&&EE*%* * 
	*%R SW
<
< *
<3>
<FO
<	
< %)#6;#6#6#8	 " !	
 
$g2g2 g2 &	g2
 ;g2 
5g2RW608t 
D@@&0@9=@FJ@	@r5   r  c                 2    dd l } | j                  dd       y )Nr   ztorch.compile for Metal is an early protoype and might not work as expected. For details see https://github.com/pytorch/pytorch/issues/150121r
   )
stacklevel)warningswarn)r  s    r3   _warn_prototyper    s    MM	L  r5   c                  <     e Zd ZeZd fdZ	 	 	 	 	 	 	 	 ddZ xZS )MetalSchedulingc                    t         |   |       t                t        j                  j
                  }||j                  j                  d       y y )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)r  r  r  r   r/  r  headerrc  )rA   	schedulerr  r&  s      r3   r  zMetalScheduling.__init__  sE    #''&&NN!!V r5   c                   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }| d}||j                  |<   t        ||      \  }}| d| }	|j                  |||	       |S )Nmps_lib_z.generated_kernel
)r   r/  r  src_to_kernelnext_kernel_suffixr   define_kernel)
rA   src_codenode_scheduler   r  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r3   r  zMetalScheduling.define_kernel  s     ''&&w,,,!//9K  &g&@&@&B%CDL)N*;<K.9G!!(+(;M7(S%G%")"-=,>?!!,:JKr5   )r  zOptional[Scheduler]rl   r  )r  r1   r  zlist[SchedulerNode]r   r  rl   r1   )rm   rn   ro   r  kernel_typer  r  r  r  s   @r3   r  r    s2    K,?IT	r5   r  )r2   z)Union[float, int, bool, str, CSEVariable]rl   r1   r  )=
__future__r   	functoolsr   typingr   r   r   r  sympy.printing.precedencer   r/   torch.utils._sympy.printersr   ExprPrinter_torch.utils._sympy.value_rangesr	   utilsr   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr   r   r  r   r   r   r    int8int16int32int64uint8r&   r'   bfloat16rv   r4   r7   rr   _initialize_pointwise_overridesr  cacher  r  rp   r5   r3   <module>r     s   #   / /  0  C 7 > , ,  C B 64 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
6b| 6br([ (D  . .u 5@@* @@F
  n r5   