
    Vh                    ^   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl#m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJ erd dlKmLZL ddl*mMZMmNZN  ej                  eP      ZQd ZR G d deS      ZT G d d e@      ZU eU       j                  ZW e@       j                  ZXej                  d!ej                  d"ej                  d#ej                  d$ej                  d%ej                  d&ej                  d'ej                  d(ej                  d)ej                  d*ej                  d+ej                  d,ej                  d-iZfd. Zgd/ Zh G d0 d1e?      Zieij                  d2        G d3 d4e;      Zkej                   G d5 d6             Zmd7 Znd8 Zo G d9 d:eI      Zp G d; d<eJ      Zqy)=    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreModec                "   t        | t              rVd| cxk  rdk  sKn t        j                  t        j                        }| |j
                  k(  ry| |j                  k(  ryd| dS t        | t              rdt        |        dS t        |       S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr1   repr)valinfos     N/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrE   =   s~    #s[C%E:%E{{5;;'$((?%$((?%q!!#us+,A..9    c                        e Zd Zd fdZ xZS )Unsupportedc                *    t         |   d|        y )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rD   rK   zUnsupported.__init__K   s    <UGDErF   returnNone)__name__
__module____qualname__rK   __classcell__rN   s   @rD   rH   rH   J   s    F FrF   rH   c                       e Zd Zed        Zed        Zd Zd Zd Zd Z	e	Z
d Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z fdZd ZeZd Zd Z xZS )HalidePrinterc                D    dt         j                  j                   d|  dS )Nhl.cast(, r8   )r$   kernelindex_dtypeexprs    rD   
cast_indexzHalidePrinter.cast_indexP   s"    !((../r$q99rF   c                    d|  dS )Nhl.cast(hl.Float(32), r8    r^   s    rD   
cast_floatzHalidePrinter.cast_floatT   s    'vQ//rF   c                    d| dS )Nhl.f32(r8   rc   rL   r_   s     rD   _print_FloatzHalidePrinter._print_FloatX   s    a  rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   rf   r   r8   lenargs_printrg   s     rD   _print_ToFloatzHalidePrinter._print_ToFloat[   s9    499~"""TYYq\23155rF   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.floor(r   r8   rk   rl   r`   rm   rg   s     rD   _print_floorzHalidePrinter._print_floor_   B    499~"""4;;tyy|+D*EQGHHrF   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.trunc(r   r8   rq   rg   s     rD   _print_TrunczHalidePrinter._print_Truncc   rs   rF   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   hl.ceil(r   r8   rq   rg   s     rD   _print_ceilingzHalidePrinter._print_ceilingi   sB    499~"""$++diil*C)DAFGGrF   c                J    d| j                  | j                  |             dS Nzhl.sqrt(r8   )rd   rm   rg   s     rD   _helper_sqrtzHalidePrinter._helper_sqrtm   s$    $//$++d*;<=Q??rF   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   r%   r   
hl.select(r[   r8   )doprintrl   )rL   r_   cpqs        rD   _print_WherezHalidePrinter._print_Wherep   s_    LL1&LL1&LL1&A3b2aS**rF   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr%   r   r   hl.min(r[   r8   )rk   rl   rm   sympyMinrL   r_   midabs        rD   
_print_MinzHalidePrinter._print_Minv   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rF   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr%   r   r   hl.max(r[   r8   )rk   rl   rm   r   Maxr   s        rD   
_print_MaxzHalidePrinter._print_Max   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rF   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   hl.abs(r   r8   rq   rg   s     rD   
_print_AbszHalidePrinter._print_Abs   sB    499~"""TYYq\)B(C1EFFrF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   zhl.cos((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_cosz&HalidePrinter._print_OpaqueUnaryFn_cos   9    499~"""$++diil34A66rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.cosh((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_coshz'HalidePrinter._print_OpaqueUnaryFn_cosh   9    499~"""4;;tyy|45Q77rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.acos((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_acosz'HalidePrinter._print_OpaqueUnaryFn_acos   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   zhl.sin((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_sinz&HalidePrinter._print_OpaqueUnaryFn_sin   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.sinh((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_sinhz'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.asin((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_asinz'HalidePrinter._print_OpaqueUnaryFn_asin   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   zhl.tan((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_tanz&HalidePrinter._print_OpaqueUnaryFn_tan   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.tanh((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_tanhz'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rF   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   z	hl.atan((r   r8   rj   rg   s     rD   _print_OpaqueUnaryFn_atanz'HalidePrinter._print_OpaqueUnaryFn_atan   r   rF   c                   |j                   rt        | 	  |      S |j                  \  }}| j	                  | j                  |            }| j	                  | j                  |            }| j                  d| d| d      S )Nrp   z / r8   )
is_integerrJ   _print_FloorDivrl   rd   r   r`   )rL   r_   xdivrN   s       rD   r   zHalidePrinter._print_FloorDiv   sr    ??7*4003OODLLO,oodll3/01#SQ788rF   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.round(r   r8   rq   rg   s     rD   _print_RoundzHalidePrinter._print_Round   rs   rF   c                2    |j                   \  }}d| d| dS )N() / (z+hl.f32(0)))rl   )rL   r_   r   r   s       rD   _print_IntTrueDivzHalidePrinter._print_IntTrueDiv   s$    yy11#U1#[))rF   c                ~    |j                   \  }}| j                  |      }t        |      }dd| z  d| dd|z  dS )Nrf   g      $@z)*hl.round((z	)*hl.f32()))rl   rm   r:   )rL   r_   rB   ns       rD   _print_RoundDecimalz!HalidePrinter._print_RoundDecimal   sL    Qkk#F1"(SE47+RPPrF   ) rR   rS   rT   staticmethodr`   rd   rh   rn   rr   rv   _print_TruncToIntry   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rU   rV   s   @rD   rX   rX   O   s    : : 0 0!6II %H@+##G7887887889I %*
QrF   rX   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    t         |    S N)_halide_typedtypes    rD   halide_typer      s    rF   c                    t        |       r/| j                  r#| t        j                  k7  rt        j                  } | t        j
                  t        j                  fv rt        j                  } t        |       S r   )	r   	is_signedr;   r=   int32float16bfloat16float32r   r   s    rD   halide_acc_typer      sM    5??u7K//urF   c                     e Zd Ze	 	 dB	 	 	 dCd       ZedDd       Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@edEd>       ZAed?        ZBed@        ZCedA        ZDy)FHalideOverridesNc                X    |t         j                  k(  rd|  dS dt        |       d|  dS )Nr   z != 0)rZ   r[   r8   )r;   boolr   )r   r   	src_dtypeuse_compute_typess       rD   to_dtypezHalideOverrides.to_dtype   s9     EJJqc= +e,-Rs!44rF   c                    |t         j                  t         j                  fv rdt        |       d|  d} dt        |       d|  d}|t         j                  t         j                  fv rd| d}|S )NrZ   r[   r8   zhl.reinterpret(rb   )r;   r   r   r   )r   r   r   lines       rD   to_dtype_bitcastz HalideOverrides.to_dtype_bitcast   ss    77;y12"QCq9A U!3 4Bqc;U]]ENN33+D63DrF   c                8    | j                  t        |      |      S r   )r   rE   )clsvaluer   s      rD   constantzHalideOverrides.constant  s    ||OE2E::rF   c                    d|  dS )Nr   r8   rc   r   s    rD   abszHalideOverrides.abs      1~rF   c                R    t        | d      sd|  dS d|  d| j                   d|  dS )Nnamehl.exp(r8   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   s    rD   expzHalideOverrides.exp
  s=    q&!QCq>!3A3fQVVHDefgehhijjrF   c                    d|  dS )Nr   r8   rc   r   s    rD   libdevice_expzHalideOverrides.libdevice_exp  r   rF   c                    d|  dS r{   rc   r   s    rD   sqrtzHalideOverrides.sqrt      !ArF   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r[   r8   rZ   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   s     rD   minimumzHalideOverrides.minimum       q&!QCr!A&&qvvhis!,QCq<s#aS1#U166(JijkillnopnqqrssrF   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r[   r8   rZ   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   s     rD   maximumzHalideOverrides.maximum   r   rF   c                X    t        |d      rd|j                   d| d}d|  d| d| dS )Nr   rZ   r   r8   r~   r[   r   )r   r   r   s      rD   wherezHalideOverrides.where(  s?    1f166()A3a0AA3b2aS**rF   c                    d|  dS )Nzhl.cos(r8   rc   r   s    rD   coszHalideOverrides.cos.  r   rF   c                    d|  dS )Nzhl.sin(r8   rc   r   s    rD   sinzHalideOverrides.sin2  r   rF   c                    t        d      )NlgammarH   r   s    rD   r   zHalideOverrides.lgamma6      (##rF   c                    d|  dS )Nzhl.erf(r8   rc   r   s    rD   erfzHalideOverrides.erf:  r   rF   c                    d|  dS )Nzhl.cosh(r8   rc   r   s    rD   coshzHalideOverrides.cosh>  r   rF   c                    d|  dS )Nzhl.sinh(r8   rc   r   s    rD   sinhzHalideOverrides.sinhB  r   rF   c                    d|  dS )Nzhl.acos(r8   rc   r   s    rD   acoszHalideOverrides.acosF  r   rF   c                    d|  dS )Nz	hl.acosh(r8   rc   r   s    rD   acoshzHalideOverrides.acoshJ      1#QrF   c                    d|  dS )Nzhl.asin(r8   rc   r   s    rD   asinzHalideOverrides.asinN  r   rF   c                    d|  dS )Nz	hl.asinh(r8   rc   r   s    rD   asinhzHalideOverrides.asinhR  r  rF   c                    d|  d| dS )Nz	hl.atan2(r[   r8   rc   r   ys     rD   atan2zHalideOverrides.atan2V      1#Rs!$$rF   c                    d|  dS )Nzhl.atan(r8   rc   r   s    rD   atanzHalideOverrides.atanZ  r   rF   c                    d|  dS )Nz	hl.atanh(r8   rc   r   s    rD   atanhzHalideOverrides.atanh^  r  rF   c                    t        d      )Ncopysignr   r  s     rD   r  zHalideOverrides.copysignb  s    *%%rF   c                    t        d      )Nerfinvr   r   s    rD   r  zHalideOverrides.erfinvf  r   rF   c                    d|  d| dS )Nz	hl.hypot(r[   r8   rc   r  s     rD   hypotzHalideOverrides.hypotj  r  rF   c                    t        d      )N	nextafterr   r  s     rD   r  zHalideOverrides.nextaftern  s    +&&rF   c                    |  d| S Nz & rc   r   s     rD   logical_andzHalideOverrides.logical_andr      Cs|rF   c                    |  dS )Nz == 0rc   r   s    rD   logical_notzHalideOverrides.logical_notv  s    E{rF   c                    |  d| S Nz | rc   r   s     rD   
logical_orzHalideOverrides.logical_orz  r  rF   c                    d|  d| dS )Nr    ^ r8   rc   r   s     rD   logical_xorzHalideOverrides.logical_xor~  s    1#S1~rF   c                    |  d| S r  rc   r   s     rD   bitwise_andzHalideOverrides.bitwise_and  r  rF   c                    d|  S )N~rc   r!  s    rD   bitwise_notzHalideOverrides.bitwise_not  s    1#wrF   c                    |  d| S r$  rc   r   s     rD   
bitwise_orzHalideOverrides.bitwise_or  r  rF   c                    |  d| S )Nr'  rc   r   s     rD   bitwise_xorzHalideOverrides.bitwise_xor  r  rF   c                    |  d| S )Nz << rc   r   s     rD   bitwise_left_shiftz"HalideOverrides.bitwise_left_shift      D}rF   c                    |  d| S )Nz >> rc   r   s     rD   bitwise_right_shiftz#HalideOverrides.bitwise_right_shift  r4  rF   c                    d|  d| dS )Nzhalide_helpers.rand(r[   r8   rc   seedoffsets     rD   randzHalideOverrides.rand  s    %dV2fXQ77rF   c                    d|  d| dS )Nzhalide_helpers.randn(r[   r8   rc   r8  s     rD   randnzHalideOverrides.randn  s    &tfBvha88rF   c           	          d|  d| d| d| d	S )Nzhalide_helpers.randint64(r[   r8   rc   )r9  r:  lowhighs       rD   	randint64zHalideOverrides.randint64  s#    *4&6("SED6KKrF   c                    t        j                  | d       dt        j                  j                  j                  d|       S )Nr    + load_seed_offset)opsloadr$   r\   rl   seed_offset)r   r:  s     rD   	load_seedzHalideOverrides.load_seed  s7    ((4#$C(A(ABTV\(]'^__rF   c                    d|  dS )Nz1./hl.sqrt(r8   rc   r   s    rD   rsqrtzHalideOverrides.rsqrt  s     QCq!!rF   c                    d|  dS )Nzhl.tan(r8   rc   r   s    rD   tanzHalideOverrides.tan  r   rF   c                    d|  dS )Nzhl.tanh(r8   rc   r   s    rD   tanhzHalideOverrides.tanh  r   rF   c                    d|  dS )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rc   r   s    rD   signbitzHalideOverrides.signbit  s    DQC~VVrF   c                    |  d|  d| d| S )Nz - hl.trunc(/z)*rc   r   s     rD   fmodzHalideOverrides.fmod  s!     L1QCr!--rF   c                    d|  d| dS )Nzhl.pow(r[   r8   rc   r   s     rD   powzHalideOverrides.pow  s    2aS""rF   c                    d|  dS )Nzhl.log(r8   rc   r   s    rD   logzHalideOverrides.log  r   rF   c                    d|  dS )Nz hl.is_inf(hl.cast(hl.Float(32), r   rc   r   s    rD   isinfzHalideOverrides.isinf       2!B77rF   c                    d|  dS )Nz hl.is_nan(hl.cast(hl.Float(32), r   rc   r   s    rD   isnanzHalideOverrides.isnan  rZ  rF   c                    d|  dS )Nr   r8   rc   r   s    rD   roundzHalideOverrides.round  r  rF   c                    d|  dS )Nrp   r8   rc   r   s    rD   floorzHalideOverrides.floor  r  rF   c                    d|  d| dS )Nr   r   z + hl.f32(0))rc   r   s     rD   int_truedivzHalideOverrides.int_truediv  s    1#U1#]++rF   c                .    d| j                    d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r8   r   r   s     rD   floordivzHalideOverrides.floordiv  s)     18J1#TRSQTTUV	
rF   c                4   t        j                  t        j                  d|      t        j                        }t        j                  t        j                  |d      t        j                        }t        j
                  ||      }d|j                   d| dS )N0rZ   r   r8   )rE  r   ltr;   int8subr   )r   r   leftrightrl  s        rD   signzHalideOverrides.sign  sg    ||CFF3NEJJ7SVVAs^UZZ8ggdE"!&&3%q11rF   c                    d|  dS )Nru   r8   rc   r   s    rD   trunczHalideOverrides.trunc  r  rF   c                .    d| j                    d|  d| dS )Nz"hl.trunc(hl.cast(hl.Float(max(32, rd  re  r8   rf  r   s     rD   truncdivzHalideOverrides.truncdiv  s)    
 18J1#TRSQTTUV	
rF   c                    d|  dS )Nrx   r8   rc   r   s    rD   ceilzHalideOverrides.ceil  r   rF   c                    d|  dS )Nr   z, 0)rc   r   s    rD   reluzHalideOverrides.relu  s    4  rF   c                ~   t         j                  j                  |      }t         j                  j                  t         j                  j	                  |      t         j                  j                  |      t        |            }|t        j                  t        j                  fvrt        j                  ||      S |S Nbounds)r$   r\   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r;   r   r=   rE  r   )r   r_   r   indexvars        rD   
index_exprzHalideOverrides.index_expr   s    ))$/hhHH!!%(HH))%0(.  

 ekk22<<U++
rF   c                    t        j                  |t        j                        }t        j                  |||      }||_        t        t        |            S r   )rE  r   r;   r   halide_clampindirect_indexing_sizer!   str)r   	index_varsizecheckwrap_negs        rD   indirect_indexingz!HalideOverrides.indirect_indexing  sC     LLEKK8	$$Ye<	+/	(!#i.11rF   c                    t         j                  j                  t         j                  j                  |      dz
        }t	        |t
        t        j                  f      sd|j                   d| d}d| d| dS )Nr%   rZ   r   r8   z	hl.clamp(z, 0, )	r$   r\   kexprrename_indexingr9   r:   r   Integerr   )r   r   r  r  ends        rD   r  zHalideOverrides.halide_clamp  sj    hhnnQXX55d;a?@$emm 45UZZL	#a8C 5'se1--rF   c                   t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t         j                  j                  d|j                   dt        |       dg t        j                  |            }t        j                  ||      S # 1 sw Y   xY w)NrZ   r   r8   rz  )r$   r\   
mask_loadsr{  is_boolr   r}  r   rE   r   wraprE  r   )maskbodyothernew_maskresults        rD   maskedzHalideOverrides.masked  s    XX  u- 	VF	 ==  KE   v{{m9_U-C,DAF##E* ! 
 yy6511	 	s   B88Cc                    t        d      )Nfrexp)NotImplementedErrorr   s    rD   r  zHalideOverrides.frexp.  s    !'**rF   )NT)r   torch.dtyper   Optional[torch.dtype])r   r  r   r  )TT)ErR   rS   rT   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r
  r  r  r  r  r  r  r  r  r"  r%  r(  r*  r-  r/  r1  r3  r6  r;  r=  rA  rH  rJ  rL  rN  rP  rS  rU  rW  rY  r\  r^  r`  rb  rg  ro  rq  rs  ru  rw  r  r  r  r  r  rc   rF   rD   r   r      s&    ,0	55 )5 5   ; ;   k k
     t t t t + +
     $ $                   % %       & & $ $ % % ' '                     8 8 9 9 L L ` ` " "     W W . . # #   8 8 8 8         , , 
 
 2 2     
 
   ! ! 	 	 2 2 . . 2 2  + +rF   r   halidec                  l     e Zd Z ej                  d      Z	 d	 	 	 	 	 d fdZd Zd Zd	dZ	d Z
 xZS )
HalideCSEVariablez\b(tmp\d+)\[\?\]c                6    t         |   |||       d | _        y r   )rJ   rK   	used_dims)rL   r   r{  r   rN   s       rD   rK   zHalideCSEVariable.__init__9  s     	vu-7;rF   c                T   t        | j                  xs d      }t        j                  ||j	                               D ]D  }t        |t              s|j                  
J |||f       |j                  |j                         F t        j                  j                  |      | _        y )Nrc   )r   r  	itertoolschainvaluesr9   r  updater$   r\   sort_used_dims)rL   r   rl   kwargsusedargs         rD   update_on_argsz HalideCSEVariable.update_on_argsB  s    $...B/??49 	+C#01}}0C4d2CC0CMM*	+ 006rF   c                    t        |      dk(  r| j                   dS | j                   ddj                  t        t        |             dS )Nr   z[()][r[   ])rk   r   joinmapr  )rL   dimss     rD   	index_strzHalideCSEVariable.index_strJ  sE    t9>ii[%%))AdiiC78::rF   c                n    | j                   | j                   dS | j                  | j                         S )Nz[?])r  r   r  )rL   s    rD   __str__zHalideCSEVariable.__str__P  s0    >>!ii[$$~~dnn--rF   c           	         | j                   t        d | j                   D              sJ | j                  | j                   D cg c]  }|j                  ||       c}      S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r9   r   Expr.0r   s     rD   	<genexpr>z-HalideCSEVariable.subs_str.<locals>.<genexpr>W  s       2
*+Jq%**%2
   $&)r  allr  get)rL   replacementsr   s      rD   subs_strzHalideCSEVariable.subs_strV  s^    ~~)c 2
/3~~2
 /
 	
 
 ~~t~~N!|//15NOONs   A$r   )r{  zValueRanges[Any]r   r  rP   rQ   )rP   r  )rR   rS   rT   recompileundefined_rerK   r  r  r  r  rU   rV   s   @rD   r  r  6  sQ    2::12L (,	< !< %	<
 
<7;.PrF   r  c                  H     e Zd ZU ded<   ded<   ded<   d fdZd	dZ xZS )
DimensionInfozOptional[sympy.Expr]r_   
sympy.Exprr  stridec                    t         |           t        j                  j                  j                  |d      r| }| }|| _        || _        || _        y Nr   )	rJ   rK   r$   graphsizevarsstatically_known_ltr_   r  r  )rL   r_   r  r  rN   s       rD   rK   zDimensionInfo.__init__c  sK    77//:WF5D		rF   c                   | j                   J | j                   }|r|dk(  ry|ri |}|j                  D ]  }t        |t        j                        st        |t        j                        sJ t        j                  j                  |j                        }t        |t              sJ t        |j                  |            ||<    t        ||      }t        j                  j!                  |      S )Nr   hl.Var())r_   free_symbolsr   r   TMPr9   r   Symbolr$   r\   lookup_cse_varr   r  r!   r  r"   r~  )rL   r  	zero_varsr_   symr  s         rD   r  zDimensionInfo.index_strl  s    yy$$$yy+l+L(( W!#txx0%c5<<888((11#((;C%c+<===(:3<<;U(VL%W dL1Dxx$$T**rF   rO   NF)rR   rS   rT   __annotations__rK   r  rU   rV   s   @rD   r  r  ]  s    

+rF   r  c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k(  r*t         j                  j                  j                  | |       ||k(  S # t
        $ r Y yw xY wNTF)r$   r  r  statically_known_equals	size_hint	TypeErrorguard_equals)rm  rn  r   r   s       rD   eqr  }  s    ww//e<GG&&t,GG&&u- 	Av	%%dE26M	  s   AB3 3	B?>B?c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k  r*t         j                  j                  j                  | |       ||k  S # t
        $ r% t        j                  | |      }|| k(  r| |k7  cY S Y yw xY wr  )	r$   r  r  r  r  r  r   gcdguard_lt)rm  rn  r   r   r  s        rD   rj  rj    s    ww++D%8GG&&t,GG&&u- 	1u	!!$.q5L  iie$$;5= 	s   AB3 3)C! C!c                      e Zd ZU eZeZded<   	 	 	 	 d" fdZd#dZ	d$dZ
d% fdZd Zd Z	 	 d& fd	Zd
 Zd'dZd Zd Zd&dZd Zd(dZd)dZd*dZ	 d+	 	 	 	 	 	 	 	 	 d,dZ	 	 	 	 	 	 	 	 	 	 d-dZd Z	 	 	 	 	 	 	 	 d.dZ ej:                         d	 d/dZd/dZd*dZ d Z!d0dZ"d+dZ#e$d        Z%d+d*dZ&d  Z'	 	 	 	 	 	 	 	 d1d!Z( xZ)S )2HalideKernelzCallable[[sympy.Expr], str]r  c                x   t        |   |fi | | j                  | _        | j                  | _        | j                  | _        t               | _        | j                  | _	        | j                  | _
        i | _        i | _        i | _        i | _        i | _        i | _        t#        t$              | _        d| _        y r  )rJ   rK   r  computeloadsstoresr)   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rL   tilingr  rN   s      rD   rK   zHalideKernel.__init__  s    
 	*6*yyYY
ii!/!1"&"7"7!22AC57;=@BCEHJ4?4E%*"rF   c                    t        |      S r   )r   )rL   r   s     rD   dtype_to_strzHalideKernel.dtype_to_str  s    5!!rF   c                ^    | j                   j                  | d|d       t        |||      S )Nz = hl.Func(r8   )r  	writeliner  )rL   r   r{  r   s       rD   create_cse_varzHalideKernel.create_cse_var  s0    		tfKxq9: vu55rF   c           
        | j                   s| j                  s| j                  rJ t        j                  t
        j                  j                  j                  t              t        j                  t        t         | 8  |            }t        t                   t"        j$                  j'                  | j(                  D cg c]  }|j*                  j-                          c}      D ci c]  }|j/                         | c}d }fd}fd}|D ]  }|j1                  t2              rV|j5                  t3        t7        j8                  d      t7        j8                  d      t7        j8                  d            |       |j1                  t:              rB|j5                  t;        t7        j8                  d      t7        j8                  d            |       j=                  t         | 9  |      j>                          tA        d D              | _!        d	}tE        | j(                        D ]%  }|j*                  j-                         D cg c]  }|j/                         v s| }	}|	jG                  fd
       |	s+|	jI                  |jK                  d|jL                               d}
t6        jN                  jP                  }g }|
tS        |	      k  rtU        |jL                  |      s|	D cg c]+  }tU        |jV                  |      s ||jX                        - }}|
tS        |      z  }
|sJ |	       |t        jZ                  t
        j                  j                  j\                  |      z  }|j_                  |	D cg c]C  }ta        ||jV                        r+ta        |jV                  |      r ||jV                  |z        E c}       |rt        jZ                  t6        jb                  |      }tU        |d      r2 ||jL                  |z        }tU        |d      rJ g }tS        |	      }
d}te        dtS        | j                               }|jf                  r.te        dtS        | j                               | j                  |<   || j                  |<   |jI                  ||f       ||z  }|	D cg c]%  }tU        |jV                  |      s|jX                  ' }}|
tS        |      z  }
tS        |      }|D cg c]&  }tU        ||      st7        jh                  ||z        ( }}tS        |      |k  s|dk(  sJ |j_                  |       |r|
tS        |	      k  rtU        |jL                  |      s|	D ]  }	 d}d}tU        |jV                  |      s)||   \  }}|dz  }||z  }tU        |jV                  |      s)d}t6        jN                  jj                  }tU        |jX                  |      s1||   \  }}|dz  }|||z  z  }||z  }tU        |jX                  |      s1|| j                   |j/                         <    ( | j                  D ]-  }| jp                  js                  | d|jt                  d       / | j                  rL| jw                  d| j                  jy                         D ci c]  \  }}|| j                  |    c}}       yyc c}w c c}w c c}w c c}w c c}w c c}w c c}w # tl        $ r |sJ t6        jN                  jj                  }t6        jN                  jP                  }|D ]  \  }}|||z  z  }||z  } t
        j                  j                  jo                  t3        ||jV                  |jX                        | j                        | j                   |j/                         <   Y Mw xY wc c}}w )a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                z    t        j                  t        j                  j                  j                  |             S r   )r   simplifyr$   r  r  remove_precomputed_replacementsr^   s    rD   r  z0HalideKernel.finalize_indexing.<locals>.simplify  s+    >>  @@F rF   c                   | v r|    }j                  |j                  j                  |j                  |z  t        j
                  j                  j                  |t        |j                  |                  j                                y y r   )addrootlookupdivisorr$   r  r  evaluate_minr   lengthsymbol)baser  modulusnodeall_used_symbolssym_to_nodes       rD   visit_modular_indexingz>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  sw    {""4( $$II$$w.((55#Xdkk7%C
 fh #rF   c           	         | v r`|    }j                  |j                  j                  |j                  |z  t	        |j
                  |            j                                y y r   )r  r  r  r  r   r
  r  )r  r  r  r  r  s      rD   visit_floor_divz7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s]    {""4( $$II$$w. g6 fh	 #rF   r  r  r  c              3  P   K   | ]  }t        |t        j                           y wr   )r   r   INDIRECT)r  r  s     rD   r  z1HalideKernel.finalize_indexing.<locals>.<genexpr>  s       )
36N3.)
r  Fc                (     | j                         S r   )r  )r   r  s    rD   <lambda>z0HalideKernel.finalize_indexing.<locals>.<lambda>  s    Yqyy%9 rF   keyr%   r   Thhrz
 = hl.Var(r8   rdomN)=r  r  r  	functoolspartialr$   r  r  r  r   dictfromkeysr  rJ   r|  r   r   r  r  from_iterablerange_treesnodesr  r  hasr   replacer   Wildr   r  r  anyr  reversedsortappendr  numelSOnerk   r  r  r
  reduceevaluate_maxextendrj  r  r!   is_reductionr  Zero
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)!rL   indicestreer   r  r  r  r  had_fallbackr#  handled_countr  added_sym_sizesizes_to_addr  	next_sizer  	new_sizes	prior_lensr  idxr  r
  r_   
full_indexr  vrvr  r  r  rN   s!                                @@@rD   finalize_indexingzHalideKernel.finalize_indexing  s    ##t'7'74;Q;Q	
 
 %%agg&6&6&@&@3O	--EG$<g FG%c?, __22151A1AB""$B
 HHJM
	

		  	REyy)#

6*

9-

9-
 + yy"

6*

9- $ ##EG$<U$C$P$PQ%	R( &) )
:J)
 &
" T--. S	D $

 1 1 3V1qxxzEU7UQVEVJJ9J:T[[DJJ78MggkkGN#e*,R

G5L05 +,AIIw9OHQXX&    \!22#*U*|	 0 0GG$$11<!  ## "'gqyy1bC6H !W!45 # ) 0 0L II)Q' %-TZZ'-A$B	#%i#333')(+E
'+,qT5E5E1F0G-HIC((6H T%5%5!6 787..s3 -6D$$S)"))3	*:;y(G38 SaBqyy'<R SI S!S^3M #L 1I ".$!!Y/ q9}5$L $
 |,y8INJJ ''	27 #!  #e*,R

G5L\  CG w7$23$7	Tq4 !w7 F 77<<D f5$23$7	Tq,$	 !f5
 >BD++DKKM:oS	l ## 	JC((C5
388,a)HI	J!!6:6L6L6R6R6TUUQT%%a((U "c C
z W 0 !T$2 " ''<!&J"WW[[F%3 '	T"fsl2
$' ((==+JdkkR ,, ++DKKM:( Vsd   ![81[=\2\\\A\
\+\+\A\A"\\_(C_%$_%c           
        | j                   rdnd}|| j                  v r| j                  |   S i }| j                  j                         D ]c  }| j                   s|| j                  v rt        j                  d|j                        }|sJ t        d| |j                  d             ||<   e | j                  | d|j                         D ci c]  \  }}|| j                  |    c}}       || j                  |<   |S c c}}w )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r%   dom)r  r  r  keysr  r  matchr   r!   groupr6  r7  )rL   prefixrenamesr  mrD  rE  s          rD   setup_dom_indexingzHalideKernel.setup_dom_indexingk  s   --3T%%%##F++##((* 	HC((SD4J4J-Jchh/AH1-&!''!*.FGGCL	H 	hcN'--/RBR!1!1!!44R	
 $+ 	 Ss   Dc           	     v   |j                         D cg c]&  }d| j                  | j                  |             d( }}| j                  j	                  | ddj                  |       d       t        |j                               D ])  \  }}| j                  j	                  | d| d| d       + y c c}w )	Nhl.Range(0, r8   z = hl.RDom([r[   ]) = r  r  )r  r  r  r5  r  r  	enumeraterK  )rL   r   varsr  rsizesrH  rsyms          rD   r6  zHalideKernel.codegen_rdom  s     
 4::d&:&:4&@AB!D
 
 	$$v\$))F:K9LB%OP - 	BGAt((D6TF!A3a)@A	B
s   +B6c                    t         |   |      }t        || j                        }t        j
                  j                  j                  || j                        S r   )	rJ   r|  r"   r  r$   r  r  r4  r  )rL   r  rN   s     rD   r|  zHalideKernel.prepare_indexing  sI     (/5$"9"9:ww44UD<L<LMMrF   c                    t        |t        j                        r%| j                  |j                        j
                  S | j                  |   S )zThe size of an index symbol)r   r   r  r  r   r  r  )rL   r  s     rD   sym_sizezHalideKernel.sym_size  s<    #txx(&&sxx0GGG$$rF   c           	     	    g t        |j                  d       D ]~  }t        |t        j                  t        j
                  f      rj                  |       ?t        |t        j                  t        j                  t        j                  f      ryJ |        t        j                  j                  }D ci c]  }|t        j                  j                   }}g }t        j                   j                  |            }t        |t        j                         r|j"                  n|gD ]  }	|	j                  D 
cg c]	  }
|
|v s|
 }}
t%        |      dk(  r||	z  }5t%        |      dk(  r||d   xx   |	z  cc<   Tg }t'        t%        |            D ]e  }||   J ||   \  }}t)        |      t)        |      z  r*|j+                  |D 
cg c]	  }
|
|vs|
 c}
       |	|z  }	S|j                  ||f       g g |||	f}  fd}g }|D ]8  \  }}|D ]  }
||j-                  |
      z  } |j                   |||             : |j/                         D ]  \  }}|j                   |||g               |j1                  d        |sA j2                  r|j                  t5        t        j                  j                  dd             n}t6        j8                  j:                  j=                  |d   j>                  d      sF|jA                  dt5        t        j                  j                  rdn|d   j>                  d             |rs| jB                  v rit6        j8                  j:                  jE                  | jB                  |         r2 jG                  || jB                  |   z
          jB                  |   }n>t6        j8                  j:                  jI                  |d      r jG                  ||       d}|}tK        jL                         D ]W  } jO                  |||      r||fc S rJ | d| }| jP                  |   vs: jP                  |   j                  |       Y yc c}w c c}
w c c}
w )	zEConvert address-based indexing into dimensions using self.halide_varsc                    | j                   S r   rf  r   s    rD   r  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s
    AFF rF   r  r   r%   Nc                    t        j                  |       } t        |      dk(  rUt        j                  d
      }| j	                  ||d   z        }|r%t        |d   	j                  |d         ||         S rJ |        t        j                  t        | |D ci c]  }|	j                  |      dz
   c}      dz         }t         j                  j                  }t        | t         j                        rt| j                  D ]e  }t        |t         j                        s||z  }t        j                  | |z        } t        j                  t        j                  ||z              }g t        | ||      S c c}w )Nr%   wild)excluder   )r   factorrk   r&  rL  r  r\  r  r"   r,  r-  r9   Mulrl   r  ceiling)r_   symsstride_wildrP  r  r
  r  termis_storerL   symbolss           rD   expr_to_dimensionz>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  sG   <<%D4yA~#jjAJJ{T!W45(QtAw!7;   %%<^^4!N##t}}S'9A'="=!NORSSF WW[[F$		* II ND!$6$$~~dTk:!&ftm0L!M	N
 !vv66 "Os   E;c                t    t         j                  j                  j                  | j                  t
              S )Nr  )r$   r  r  r  r  r   )ds    rD   r  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s$     0 0 : :188c : R rF   _view))sortedr  r   r   HALIDEr  r*  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   r,  r2  expandr  r9   Addrl   rk   ranger   r0  popr7  r)  r  r  r$   r  r  r  r  insertr  statically_known_geqapply_offset_to_dimensionstatically_known_gtr  countinstall_dimsr  )rL   r  r  rh  r  r:  rA  
split_exprsplit_failedpartrD  	part_varsnew_split_failedrH  
other_vars
other_partrj  r  re  r_   orig_varri  s   `  `                 @rD   indexing_to_dimensionsz#HalideKernel.indexing_to_dimensions  s5   %,,2BC 	CcDKK#:;s#%))		--   		 /67!ao7
7DFT11%89",UEII">EJJUG 	FD$($5$5IqjIII9~"$Y1$9Q<(D0(#% s<01 JA'?666-9!_*J
!*-
90EE!((Z)V1ICU!)VW
*(//Z0HIJ  F!1EIt3DE!	F$	7. & 	7JD$ *
q))*KK)$56	7 $))+ 	8ICKK)$67	8		R	S))M%'',,1=>!!99$q'..!LKK=Hq$q'..RST d)))agg.>.>.S.S++C0/ ..tVd>Q>QRU>V5VW,,S1!!55 ..tV<" 	:A  dFH=Dy <JeA3'C$--h77##H-44S9	:] 8 J *Ws   ="R 	RR	R
R
c                f   || j                   vr|| j                   |<   || j                  |<   y| j                  |   |k7  s$t        | j                   |         t        |      k7  ry|r| j                   |   |k(  S t        | j                   |   |      D ]  \  }}|j                  |j                  k7  r y|j
                  |j
                  k7  s|j                  |j                  k7  sTt        j                  j                  j                  |j
                  |j
                        |_        d|_         y)z>Try to set self.buffer_dimensions[var], return True on successTFN)r  r  rk   zipr  r  r_   r$   r  r  r/  )rL   r  r  r:  rh  oldnews          rD   r|  zHalideKernel.install_dims  s   d,,,*.D""3''-D$s#v-""3'2
Y2 ))#.$66D2237> 	 HCzzSZZ'xx388#sxx388';77++88388L	  rF   c                   |dk(  ry t        t        t        |                  D ]  }||   j                  dk(  s8t        j
                  j                  j                  |||   j                        sMt        |||   j                        }||||   j                  z  z  }||   xj                  |z  c_	         |dk(  sJ y )Nr   r%   )
r(  ru  rk   r  r$   r  r  rx  r   r_   )rL   r  r:  rH  r  s        rD   ry  z&HalideKernel.apply_offset_to_dimension  s    Q;%D	*+ 	%AAw~~"agg&6&6&K&KQ'  Q7$a//Q$	% {{rF   c                   t        t        j                            }|j                  D ]  }t	        |t        j                        sJ t        |t        j                        rU| j                  |j                        }t	        |t              r|j                  J |j                  |j                         t        |t        j                        r|j                  |       t        |t        j                  t        j                   t        j"                  t        j$                  f      rt'        d|        | j)                  |      S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimszunhandled symbol )r   r   r  r  r9   r   r   r  r  r   r  r  r  ro  r  rp  rq  rr  INDEXr  r  )rL   r  r  r  cse_vars        rD   r  z!HalideKernel.used_dims_from_index  s   u||,.	%% 	ECc5<<000c488,--chh7w(9:))56   !2!23T[[1c"d''D4I4I4::V ),=cU*CDD#	E$ ""9--rF   c                    t        d |D              sJ t        j                  | j                  | j                  j                               D cg c]  }||v r|
 }}t        |      t        |      k(  sJ |S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r  r  s     rD   r  z.HalideKernel.sort_used_dims.<locals>.<genexpr>7  s     @:a,@r  )r  r  r  r  r  r  rk   )rL   r  r  ordereds       rD   r  zHalideKernel.sort_used_dims6  s    @i@@@@ !  $"8"8"?"?"A
 i	 
 
 7|s9~---
s   A9c                    dj                  fd|D              }t        |      dk(  rd}|S t        |      dk(  r| d}|S )Nr[   c              3  B   K   | ]  }|j                          y wr   )r  )r  rl  r  r  s     rD   r  z.HalideKernel.make_index_str.<locals>.<genexpr>C  s     Qqakk,	BQs   r   ()r%   ,)r  rk   )rL   r  r  r  r  s     `` rD   make_index_strzHalideKernel.make_index_strB  sM    IIQDQQ	t9>I  Y!^$+QIrF   c                "   | j                   j                  |      }| j                  |      }| j                  ||d      \  }}| d| j	                  |       d}t
        j                  j                  |      }|t        j                  t        j                  fv rt        j                  }d| d}| j                  rt        | j                  t              r| j                  j                  J t!        g | j#                  |      | j                  j                        }| j%                  | j'                  |            }|j                  r| j(                  j+                  |j,                   d       | j(                  j+                  |j,                   d| j                   d       | j/                  | j0                  xs d      }	| j(                  j+                  | d	t3        |       d
|	 d       | j(                  j+                  | d| dt3        |       d
|j,                   d       |S | j(                  j+                  | d| j                   d
| dt3        |       d       |S | j5                  || j#                  |            S )z"Codegen a load from an InputBufferFr  r  rb   r8   z!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(r[   rU  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rl   inputr|  r  r  r$   r  	get_dtyper;   r   r   r   
_load_maskr9   r  r  r   r  newfuncr  r  r  r   r  _load_otherr   r}  )
rL   r   r  r  r  r   r   r  r  r  s
             rD   rF  zHalideKernel.loadK  sL   iiood#%%e,//UEB	Ta++D12!4!!$'U]]ENN33MME+D63D??4??,=>OO--9: #O$++E2OT__5N5NOI \\$"5"5i"@AF		##v{{m3T$UV		##v{{m<?PPQ$RS

4#3#3#8q9		##hk+e*<)=RwaH 		##hc${;u3E2FbU[\ M 		##hmDOO+<BtfJ{[`OaNbbgh M<<d&?&?&FGGrF   c                ^    | j                   j                  t        j                  dd|         S )Nz\[.* )csevarname_mapr  rl  rL   r   s     rD   r  zHalideKernel.lookup_cse_varr  s$    xx##BFF7B$=>>rF   c                (   t        |t              sJ | j                  j                  |      }| j	                  |      }| j                  ||d      \  }}| j                  |      s|| j                         }| j                  ||      }|j                  |      }	dj                  dgt        |      z        xs d}
| j                  j                  t        || d|
 d| d             n| j                  |d	      }t        |      }	t         j"                  j%                  |      }|| d| d
t'        |       d|	 d}n+|dk(  r| d| dt'        |       d|	 d}nt)        d|       | j                  j                  t        ||             y)z"Codegen a store to an OutputBufferTNr[   r  r  r  z] = hl.undef(z.type()))r  z] = hl.cast(r8   
atomic_addz] += hl.cast(zstore mode=)r9   r  rl   outputr|  r  is_indirect_indexingrQ  r  r  r  rk   r  r  r(   r  r$   r  r  r   r  )rL   r   r  r   moder  r  r  r  	value_str
undef_dimsr   r   s                rD   storezHalideKernel.storeu  s    %!2333iit$%%e,//UDA	T$$U+t/?224L++D,?I|4I))ZL3t9$<=F$JIITcU!J<}SE#RS ++DD+AIE
I!!$'<U!I;l;u3E2FbSTUD\!U!I;mK4F3Gr)TUVD%D6&:;;		Lt45rF   c           	     ^   | j                   sJ | j                  rJ |||f}|| j                  j                  v r| j                  j                  |   S t	        |t
              r1|dk(  sJ  | j                  | x| j                  j                  |<   }|S t	        |t              r|j                  J t        | j                        }| j                  |j                  D cg c]	  }||vs| c}      }	|t        |j                        z
  r:| j                  | | j                  t        g |j                  |                  }|j                  | j                        }
t        j                   j#                  ||      }t%        |      }|dv r|	j&                   d| }| j(                  j+                  | d| d|
 d       g }d}t-        | j                        D ]C  \  }}|j/                  | d	| d
       |dk7  r|dxx   d| z  cc<   || j0                  |   z  }E | j(                  j+                  |	 ddj3                  |              n|dk(  r| j5                  ||      }	nt7        ||      }t9        j:                  t=        t?                           5   ||	|
      }ddd       d| dtA        |       d}| j(                  j+                  |	 d|        | j(                  j+                  |	 d        |	| j                  j                  |<   |	S c c}w # 1 sw Y   {xY w)zCodegen a reduction operationwelford_combineN)argmaxargmin_z = hl.z(rdom, r8   r%   r  r  *rU  rC  welford_reducerZ   r[   )!r  r  r  reduction_cacher9   tuplewelford_combine_implr  r  r   r  r  r}  r  r  r   	Reductiondefault_accumulatorr   r   r  r  rV  r*  r  r  welford_reduce_fallbackr   r$   set_ops_handlerr   r   rE   )rL   r   r   reduction_typer   	cache_keyresult_tuplereduction_varsrD  
result_varr  defaultacc_typer  partsr  rH  r  
combine_fncombine_strdefault_strs                        rD   	reductionzHalideKernel.reduction  s!    $$$$??""6	00088++I66eU#!%6666)))51DHH$$Y/,  %!238SSS#D$:$:;\\C11N+BQC

 Ju77LL'##J/R/R>/R$STE NN4#9#9:	,,22>9M"5)11!'q(89EII5'/?wykQR STEF#D$:$:; 03was!_-Q;"I1VH-I$**3//	0
 II:,c%**U2C1D EF//55eUCJ1.(KJ""??3D#EF @(Y?@$XJb1I0J!LKII:,c+ ?@II:,c+ ?@.8  +G D8@ @s    	L*L
L##L,c                   t        |t              r|j                  J t        |t              r|j                  J t        |t              r|j                  J t        g |j                  |j                  |j                  xs | j                        }|t        | j
                        z  }| j                  | j                  |            }|||fD cg c]  }d|j                   d }}|j                  }| j                  j                  | ddj                  |       d       | j                  j                  | d| d       | j                  j                  | d| d	       | j                  j                  | d
| d       | j                  j                  | d|j                  | j
                                | j                  j                  | d|j                  | j
                                | j                  j                  | d|j                  | j
                                | j                  j                  | d| d| d       | j                  j                  | d| d| d       | j                  j                  | d| d| d| d       | d| d| d| d| d| d| d| d| d| dg}	| j                  j                  | ddj                  |	       d       g }
t        d       D ]S  }|
j                  | j                  |j                               | j                  j                  |
d!    d"| d#| d$       U t        |
      S c c}w )%NrZ   z.type(), 0)z = hl.Tuple([r[   rT  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   r  rU  r  r  )r9   r  r  r   r  r  r  r  r   r  r  r  r  ru  r*  r  )rL   meanm2weightr  r  r   r  pfxr  unpackedrH  s               rD   r  z!HalideKernel.welford_combine_impl  se   $ 12t~~7QQQ"/0R\\5MMM&"349I9I9UUU?dnn?r||?f.>.>?S4CSCS
	 	Z 6 677	\\$"5"5i"@A
<@"f;MNaXaffX[1NNoo		zl-		'8J7K2NO		se:j\=>		se8J<s;<		se<
|3?@		se:dmmD<R<R.S-TUV		se8BKK8N8N,O+PQR		e<0F0F GHI	
 			se9SEC5HI		se>#l3%yQR		e*3%/H\Z]Y^^jk	
 e:cU)C5
;e8C5Yse9SEVYUZZdee;

 			zl-		&8I7J"MNq 	GAOODLL)=)=>?II8B<.J<q1 EF	G X7 Os   M+c           
     6   | j                   sJ t        |      t        |      k(  sJ g }t        t        j                            }|D ]  }t        |t              r|j                  J t        |j                        t        | j                        z  r|j                  |       n?|j                  | j                  | g |j                  g | j                  d d              |j                  |j                          | j                  | j                  |            }|j                  r+t        |j                        t        | j                        z  sJ t        ||      D cg c]  \  }}dt        |       d| d }	}}| j!                  | j#                  | j$                  d   j&                              }
|j(                   d}| d}| j*                  j-                  | d|
 d	       t        | j                        dk(  sJ d
       g | j                  \  }|t/        |      i}|t/        |      dz
  i}t        |      dk(  r(d }|j1                  |      g}|j1                  |      g}nqd }t3        t        |            D cg c]  }|j1                  |      d| dz    }}t3        t        |            D cg c]  }|j1                  |      d| dz    }}| j*                  j-                  | d ||	              t5        j6                  t9        t;                           5   |||      }d d d        | j*                  j-                  |j1                  |       d |              t        |      dk(  r|fS |D cg c]"  }| j                  | j                  |            $ }}t=        |      D ])  \  }}| j*                  j-                  | d| d| d       + t?        |      S c c}}w c c}w c c}w # 1 sw Y   xY wc c}w )Nr%   rZ   r[   r8   r  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    | d   S r  rc   r   s    rD   maybe_tuplez&HalideKernel.scan.<locals>.maybe_tuple$  s    trF   c                ,    ddj                  |        dS )Nz
hl.Tuple([r[   rT  )r  r   s    rD   r  z&HalideKernel.scan.<locals>.maybe_tuple+  s    #DIIaL>44rF   r  r  rU  ) r  rk   r   r   r  r9   r  r  r  r*  r}  r  r  r  r  r   r  r  r"  r+  r   r  r  r!   r  ru  r$   r  r   r   rV  r  )rL   dtypesr  values_origr  all_used_dimsr   r  r   initialr
  scan_domscanscan_varscan_renames_curscan_renames_prir  	read_left
read_rightrH  r  r  unpack_varsrD  s                           rD   r  zHalideKernel.scan  s+    $$$$6{c+....*,"5<<02  
	2Ee%67EOO<WWW%//*Z8N8N-OOe$LL '$Ueoo$U7P9O9O7PQSRS7T$U
   1
	2 \\$"5"5m"DE
##
:3G3G(H:""L
 )
 	
 

 !$FF 3
u u-.bq9
 

 D001A1A"1E1K1KLM oo&e,2		xj(@LM4))*a/ 	
4	
/ 0../$&8&>?$&8&>&BCv;! $,,-=>?I$--.>?@J5
 s6{+ ##$45!A3a@I  s6{+ ##$45!A3a@J 
 			zl#k'.B-CDE /@AB 	<$Y
;K	<		""#345S[9Q8RS	
 v;!= QWXAt||D$7$7$FGXXk* 	<DAqII1#SAaS :;	<[!!k
:	< 	< Ys$   )O:P P3
P
'P
Prz  c                   | j                   j                  | j                  ||      }t        |t              sJ ||_        |S ry  )r  generater  r9   r  r  )rL   r   r  r{  r  s        rD   r}  zHalideKernel.genfuncH  s@     hh		4?#0111!
rF   c                l    | j                   j                         }t        |t              sJ ||_        |S r   )r  newvarr9   r  r  )rL   r  r  s      rD   r  zHalideKernel.newfuncP  s/    hhoo#0111!
rF   c                x    t         j                  j                  |      j                         j	                         S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r$   r  
get_buffer
get_layoutstorage_sizer  s     rD   halide_buffer_numelz HalideKernel.halide_buffer_numelV  s+     ww!!$'224AACCrF   c                   d }g }| j                   j                         \  }}}}t        t        ||      |      D ]  \  }|j	                  |f       t        t              s*j                  dk(  rj                  J |j                  fd| j                  j                  j                  d      D                |S )zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                n    | \  }}t        |t              ryd|j                  v ryd|j                  v sJ y)Nr%   out_ptrr   in_ptrr   )r9   r-   r   )	arg_tuple	_call_strr  s      rD   	arg_orderz.HalideKernel.halide_argdefs.<locals>.arg_orderc  s<    &NIs#w'chh&388+++rF   r  r   c           	   3     K   | ]>  }d t        |j                  j                  j                  j                        f @ y w)N)alias_of)r.   bufferr   r:  r   )r  aliasr  s     rD   r  z.HalideKernel.halide_argdefs.<locals>.<genexpr>s  sG        !!JJIIJJ%(XX	s   AArc   )rl   python_argdefsrn  r  r*  r9   r.   r:  r  r0  r  r  r   )rL   r  r  r  r   r   call_strr  s          @rD   halide_argdefszHalideKernel.halide_argdefs^  s    
	 =?YY--/
1a#C1I9= 	MHcMM8S/*#y)zzQ3<<+???  "&!4!4!8!82!F 		" rF   c                   g }| j                         D ]4  \  }}t        |t              r	d}d}d}d}n| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}t        |      t        |      k(  sJ t        | j                  |j                           }t        |j                      d}|j                  t        ||j                  ||||j                               7 t         j"                  j%                         }	|	j&                  dk(  rDt(        j*                  j,                  g}
t(        j*                  j.                  }dt1               i}d}n|	j&                  dk(  sJ d       |	j2                  d	k  sJ d
       t(        j*                  j4                  g}
t(        j*                  j6                  }t8        j:                  j=                  |	      }d|
d	   vrAdD ]<  \  }}|j>                  |k\  s|j@                  |k\  s&|
j                  d| |         n |
j                  d       d|jB                  i}tE        d	|	j2                        }|
j                  d       |
j                  d       t(        j*                  jF                  s|
j                  d       t(        j*                  jH                  r|
j                  d       d| jJ                  v r|
j                  d       tM        |djO                  |
      |||      S c c}w c c}w )z)Compute metadata required by codecache.pyNlongr  )shaper  r:  r  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r  r   )      )r  r   )r  r%   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)(r  r9   r-   r  r   r0   r  r  r  rk   r  r/   r   r*  r   r  r$   r  get_current_device_or_throwtyper   r  
cpu_targetscheduler_cpur    r  
gpu_targetscheduler_cudar;   r  get_device_propertiesmajorminormulti_processor_countr?   assertsr  r]   r   r  )rL   argtypesr  r  r  r  r:  r   r   current_devicer
  schdulerr  r  
capabilityr  r  s                    rD   halide_kernel_metazHalideKernel.halide_kernel_meta  s   ))+ 	FAs#w' "33CHH= $..qvv67  "33CHH= $..qxx89  5zS[000t22388<='		2315OOHH!! \\	%	: <<>%'mm../F}}22H35O K!&&&0K2KK0!''1,M.MM,mm../F}}33H99.IJ q	1$L LE5!''50Z5E5E5N(8w&GH MM.)z??O
 a!5!56K 	n% 	l#}}$$MM,'==MM'"4###MM/*88F#+#
 	
Cs   +M/+M4c                
     j                   j                  rt        d       j                         }t	               }|j                  dd       |j                           j                         D ]  \  }}t        |t              r,|j                  |j                   d j                   d       B|j                  sJ |       d|j                  v rdnd	}t        |j                        }t!         j"                  |j                           }|j                  |j                   d
| d| d| d        |j                  d       |j                           j                         D ]/  \  }}|j                  |j                   d|j                          1  j                   j%                         D ]  \  }	}
|j                  |	 d
|
         |j                   j&                          fd} j(                  j*                  D ]C  }t        |t,              r t.        j0                  j3                  ||      }|j                  |       E |j                  d       |j                  d        j                         D ]  \  }}t        |t              rWt4        j6                  j8                  j;                  |j<                  d      }|j                  |j                   d| d       n j"                  |j                     }g }t?        |      D ]   \  }} jA                  t4        j6                  j8                  j;                  |jB                  d      |      }|jE                  d| d       d|j                  vsp|j                  |j                   d| d       	 |j                  |j                   d| dtG        |jH                         d       	 |j                  |j                   d| dtG        |jB                         d        |j                  |j                   ddjM                  |       d        |jO                  d       |j                  djQ                                |jR                  rk|j                  dtU        jV                  |jR                        d|jX                  d |jR                  d|jZ                  d!	d       |j]                         S |j                  d"|jX                  d#d       |j]                         S # tJ        $ r Y Rw xY w# tJ        $ r Y +w xY w)$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r8   outzhl.OutputBufferzhl.InputBufferrU  r   r[   z&
            def generate(g):
        z = g.c                    t        t        j                  j                  | j	                  d               }|j
                  J |       t        |      S )Nr%   )r   r  r  r  rM  r  r  )rP  r  rL   s     rD   update_indexz1HalideKernel.codegen_kernel.<locals>.update_index  sE    ($((*>*>qwwqz*JKC==,1c1,s8OrF   r  zassert g.using_autoscheduler()r%   r  z.set_estimate(rS  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([rT  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rl   r  rH   r  r)   splice	do_indentr  r9   r-   r  r   r]   r  r   r   rk   r  aliasesr5  r  _linesr  r  r  rl  r$   r  r  r  r_   rV  _autoscheduler_workaroundsr  r*  r:   r  r  r  do_unindentrstripr  r   find_libautoscheduler
  r  getvalue)rL   r   metacoder  r  argclsargtypendimr  r  r$  r   hintr  range_hintsrH  dims   `                 rD   codegen_kernelzHalideKernel.codegen_kernel  s   99$$/00&&(  	 
	
 	))+ 	LFAs#w'#((+=d>N>N=OqQRzz&3&z.3sxx.?*EU%cii0411#((;<#((3vhay4&JK	L 		

 	))+ 	9FAsNNchhZuSXXJ78	9		))+ 	-HCNNcU#cU+,	-D&&'	
 II$$ 	!D$$(5599,MNN4 		!
 	r78))+ 	XFAs #w'ww''11#((Q1G#((>$qAB--chh7 'o !FAs::((22388a2H$D  &&dV1'=>CHH,#((5<'HI! NN#&88*E!M#cjj/ARRS T
! NN#&88*E!M#chh-PQ R!& #((+;DIIk<R;SSUVW;	X> 	 		
 >>KK$$3$H$H$X#[ \((, 7<<@NN;MRPTPdPdOg h	  #  8 }} KK::>++ I
    }}]  ) ! !  ) ! !s$   &7T#7T3#	T0/T03	U ?U c                    t        |      dk(  rTt        j                  j                  dk(  r7t        j
                  j                         j                  dk(  rt        d|       } | S )Nr%   Anderson2021r  r   )	rk   r   r  r  r$   r  r  r  r?   )r   r  s     rD   r)  z'HalideKernel._autoscheduler_workaroundsN  sN     IN,,>335::fD Aq	ArF   c                   t         j                  j                  }| j                         D cg c]  \  }}|j                  |  }}}t         j                  j                         }|j                  dk(  r;|j                  |j                  t         j                        }|j                  |       |j                  |||d       yc c}}w )zCodegen a call to this kernelNr  F)devicetriton)r$   r  wrapper_coder  r  r  r  write_get_raw_streamr  r*  generate_kernel_call)	rL   r   r  wrapperr   r  	call_argsr  stream_names	            rD   call_kernelzHalideKernel.call_kernelY  s    ''&&*.*=*=*?X33<<CWsVX	X<<>&(!66~7K7KQWWUK[)$$!	 	% 	
 Ys   C	C	c                     yr  rc   )rL   r  s     rD   generate_assertzHalideKernel.generate_asserth  s    rF   c                     y r   rc   )rL   r_   r  loweruppers        rD   check_boundszHalideKernel.check_boundsk  s     	rF   )r  zdict[str, sympy.Expr]rP   rQ   )r   r  rP   r  )NN)r8  zSequence[sympy.Expr])r  r  )r  r  r  r  rh  r   r  )r   r  r  r  )r   r  r   )
r   r  r  r  r   r'   r  r6   rP   rQ   )
r   r  r   r  r  r5   r   +Union[CSEVariable, tuple[CSEVariable, ...]]rP   rI  )r  ztuple[torch.dtype, ...]r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]rP   rJ  )rP   r  )rP   r   )r_   r  r  r  rF  r   rG  r   )*rR   rS   rT   r   	overridestexprr  r  rK   r  r  rF  rQ  r6  r|  r\  r  r|  ry  r  r  r  rF  r  r  r  r  r  r   unknownr}  r  r  r  r  r6  r   r)  rB  rD  rH  rU   rV   s   @rD   r  r    s   I).E&.+%+ 
	+6"6iV*BNN%f:P(
..
%HN? SW66 *63>6FO6	6:;; ; &	;
 ;; 
5;z$LQ"'Q"
Q" -Q" 
!Q"h *=)<)<)>	D"HQ
fwr  
&09=FJrF   r  c                  (    e Zd ZeZedd       Zd Zy)HalideSchedulingc                    t        t        j                  t        j                  t        j                  g      }t
        j                  j                  r|j                  t        j                         |S r   )
r   r&   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r  scan_kernelsr  SCAN)r   r:  r  s      rD   get_backend_featuresz%HalideScheduling.get_backend_featurest  sR    ..6677
 ==%%JJ~**+rF   c                   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }||j                  |<   |j                  d       t               }|j                  d|j                         d       |j                  |d       |j                  d       t        ||      \  }}| d| }	|j                  ||j                         |	       t        d	      rt        |d
|       |S )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr   z''')
kernel_metadatar  )r$   r  r<  src_to_kernelnext_kernel_suffixadd_import_oncer)   r  r  r%  r   define_kernelr-  r   r   )
rL   src_codenode_scheduler\   r?  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rD   r^  zHalideScheduling.define_kernel  s"   ''&&w,,,!//9K. + +7+E+E+G*HIK.9G!!(+##W -.O%%'(A(A(C'FeL ""84"8%%f-(;M7(S%G%")"-=,>?!!_5579I ''89#KX>rF   N)r:  ztorch.devicerP   zOrderedSet[BackendFeature])rR   rS   rT   r  kernel_typer  rV  r^  rc   rF   rD   rO  rO  q  s    K
 
rF   rO  )r
__future__r   dataclassesr  r  loggingr  collectionsr   mathr   typingr   r   r   r	   r
   r   r   r;   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r    r!   r"   virtualizedr#   rE  r$   commonr&   r'   r(   r)   r*   r+   r,   r-   r.   cppr/   	cpp_utilsr0   simdr1   r2   r3   collections.abcr4   r5   r6   	getLoggerrR   rW  rE   RuntimeErrorrH   rX   r   rL  pexprr   r   r   r   float64rk  int16r   r=   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesr  	dataclassr  r  rj  r  rO  rc   rF   rD   <module>r     s   "     	 #  F F    - , ? 7 4  ' ) B ) 7  )
 
 
   ; ; (6g!
F, F
zQM zQz 	 
JJ	NNO	MM>	MM>	MM>	JJ	KK	KK	KK	KK	LL-	LL-	LL-"C+k C+L
  / / 9$P $PN + + +>
 T: Tn+~ +rF   