
    VhW                       U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dl	mZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0 d dlm1Z1 d dl2Z2d dl3Z3d d	l4m5Z5 d d
l6m7Z7 d dl8m9Z9 e(r\d dlm:Z:m;Z;m<Z< d dl3m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG ddlHmIZI ddlJmKZK ddlLmMZM ddlNmOZOmPZPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZY g dZZ e)d      Z[ ej                  d      dd       Z]d dl^m_Z_ d dl`maZa d dlbmcZc d dldmeZe d d lfmgZg d d!lhmiZi d d"ljmkZkmlZlmmZmmnZnmoZo d d#lpmqZqmrZr d d$lsmtZtmuZu dd%lvmwZw dd&lxmyZz ej                  d'k(  Z{ ej                  e}      Z~ e)d(      Zee2j                  e2j                  f   Ze&e*e3j                  ee3j~                  f      Zd)d*d+Zd,Zd,Zd,Zd-Zd.Zeedz
  z  d k(  red/k\  sJ d0       dd1Zdd2Z G d3 d4e2j                        Z	 d	 	 	 	 	 	 	 dd5Z ej                  d      dd6       Zdd7Zdd8Zdd9Zdd:Z	 	 	 	 	 	 dd;Zydd<Z	 	 	 	 dd=Z	 	 	 	 dd>Zd d?Zd@ f	 	 	 	 	 ddAZ	 	 	 	 	 	 	 	 ddCZdddDZ	 	 d	 	 	 	 	 	 	 	 	 ddEZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddFZd	dGZd
dHZddIZddJZddKZ e.dL      Z e)dMdNO      Z G dP dQe'e#eef         ZddRZ	 	 	 	 ddSZ	 	 	 	 	 	 ddTZ	 	 	 	 	 	 ddUZ	 d	 	 	 	 	 ddVZ	 	 	 	 	 	 ddWZddXZddYZddZZdd[Zdd\Zdd]Zdd^Zdd_Zdd`Z	 	 	 	 ddaZddbZg ZdBedc<   d ddZd!deZd dlZd!dfZej|                  	 	 	 d"	 	 	 	 	 	 	 d#dg       Zd$dhZ	 	 	 	 	 	 d%diZ ej                  d/      d&dj       Z G dk dle%      Zej                   G dm dn             Z G do dp      Z G dq dreƫ      Zej|                  d'ds       Z G dt du      Z G dv dweɫ      Z ej                  d      d(d)dx       Zej                  d*dy       Z̐d*dzZ	 	 	 	 	 	 d+d{Zdd|Z	 	 	 	 	 	 d,d}ZАd-d~Zѐd-dZddd	 	 	 	 	 	 	 d.dZӐd/dZԐd0dZ ej                  d      d1d       Z ej                  d      d2d       Zאd3dZؐd0dZِd3dZڐd3dZ	 	 	 	 	 	 	 	 d4dZ	 	 	 	 d5	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d6dZddZ G d d      Z	 	 	 	 	 	 	 	 d7dZ	 	 	 	 	 	 	 	 d8dZd9dZd:dZd;dZd;dZ	 	 	 	 	 	 	 	 d<dZej|                  	 	 	 	 	 	 d=d       Z	 d	 	 	 	 	 d>dZd?dZd@dZdAdZdAdZdBdZdCdZej|                  dDd       Zd*dZ ej                  d      d*d       Z ej                  d      d&d       Z ej                  d      d*d       Zd*dZdEdZdFdZddZddZdGdZddZ G d dej                        Z	 	 	 	 	 	 	 	 	 	 dHdZdIdZ	 	 	 	 dIdZ	 d	 	 	 	 	 dJdZ dKdZdLdZdLdZ	 	 	 	 	 	 dMdZ	 	 	 	 	 	 	 	 dNdZd f	 	 	 	 	 	 	 	 	 	 	 dOdZd f	 	 	 	 	 	 	 	 	 	 	 dOdZdPdZdQdZ	ej                   G d d             Z
ej|                  dRd       ZdSdZdTdZdUdZdVdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 dWdÄZdXdĄZdYdńZdZdƄZd[dǄZ	 	 	 	 	 	 	 	 d\dȄZd]dɄZ	 	 	 	 	 	 d^dʄZd_d˄Z	 	 	 	 	 	 d`d̄Z	 	 	 	 	 	 dad̈́Zdbd΄Z	 	 	 	 	 	 dcdτZddЄZdSdфZddddddd؜ZejA                         D  ci c]  \  } }|| 
 c}} Z! ejD                  d٫      Z#dddڄZ$dedۄZ%dfd܄Z&dfd݄Z' ej                  d      dgdބ       Z(ej                   G d߄ d             Z)i Z*ded<   	 	 	 	 	 	 	 	 dhdZ+didZ, e)d      Z- e)d      Z. G d dee-e.f         Z/ e-dN      ddNddjd       Z0dkdZ1	 	 	 	 	 	 dldZ2 G d dej                        Z3 ej                  d      dmd       Z4ddZ5yc c}} w (n      )annotationsN)
CollectionIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeGuard)mock)DeviceProperties)
OrderedSet)tree_map_only)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)ShapeEnv)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpuTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr)   r   r9   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      E/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerH   P   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRanges)config)ceildivwin32_Tz.cubinz.spv)r9   r;         @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr)   )ALIGN_BYTES)nbytess    rG   _alignrc      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdra   )vs    rG   rl   rl      sQ    !eii+,3{AFF+,,aK599Q#<#KKrd   c                  *    e Zd ZdZdZdZedd       Zy)rn   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr)   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)rf   intrg   Integerrc   rl   )clsvalues     rG   evalz
align.eval   s6    ec5==12#e*%%uL rd   N)rx   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodry    rd   rG   rn   rn      s!    FEJ rd   rn   c                    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Ar9   )dtypedeviceT)enable_timing   r)   )
activitiesNz
raw eventsself_device_time_total)sort_by	row_limitzContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wrt   )device_time_total).0events     rG   	<genexpr>z+do_bench_using_profiling.<locals>.<genexpr>   s     A%e%%A   g     @@zprofiling results: %s ms)r@   r9   synchronizeemptyru   Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAlogdebugkey_averagestablerL   eventsdevice_typerK   namerB   RuntimeError	enumerate_build_treesum)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpir   filtered_eventsnum_event_per_groupactual_eventsress                    rG   do_bench_using_profilingr      s    D	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
		NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   AM05M=
N
0M:c                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   r@   _C%_dispatch_has_kernel_for_dispatch_keyhasattrr?   opsImportErrorr   str)r   es     rG   has_torchvision_roi_alignr      s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
r@   tensorr   rf   r   typer   rI   Workercurrent_devicer   device_interfaces     rG   decode_devicer      s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrd   c                |    t        j                  t        j                  | t        j
                  j                        S rt   )	functoolsreduceoperatormulrg   SOne)its    rG   sympy_productr   	  s#    HLL"eggkk::rd   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wrt   r   )r   abs      rG   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rB   rg   expandr   zip)seq1seq2s     rG   	sympy_dotr     s8    t9D	!!!<<>c$o>>??rd   c                \    | D ci c]  }t        |      | c}j                         S c c}w rt   )idvalues)r   rD   s     rG   uniquer     s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )rf   rg   ExprrO   sympifyru   r   runtime_ceildiv)numerdenoms     rG   rY   rY     s     %$
5%**(Eu}}U+U]]5-ABB eS!j&< 'DK=5'DK=9< 5%((rd   c                2   | yt        |       j                  d      d   }i dddddd	d
ddddddd	ddddddddddddddddd d!d"dd#d$d%d&}t        |j                               D ]  }|||<   	 t	        | t               r| S d'||    S )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float16fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64uint8u16u32u64)uint16uint32uint64*)r   splitlistr   rf   )key	dtype_strtysrp   s       rG   _type_ofr  $  sG   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	6 	F 	6 	6 	  	!" 	#$ 	%& 	'( -C2 #**, AS#&3@aI/?,@@rd   c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rg   r   )lstr   s     rG   convert_shape_to_inductorr!  J  s!     '**EMM!***s   $c                   ddl m} | D cg c]j  }t        |t              r|nUt        |t        j
                        rt        |      n0|j                  j                  j                  j                  |d      l c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r)   VN)hint)
virtualizedr$  rf   ru   rg   rv   graphsizevars	shape_envcreate_symintnode)r   r$  r   s      rG   convert_shape_to_symintr+  U  sx        !S!  a/ AWW%%//AA!$AO  s   A/A=c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wrt   )
alias_infor   r   s     rG   r   zis_view.<locals>.<genexpr>p  s     FAq||4'Fs   )any_schema	arguments)ops    rG   is_viewr4  l  s     F1E1EFFFrd   c                     yNFr   )r   s    rG   <lambda>r7  u      rd   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wrt   )is_pointwise_use)r   uis_pointwise_fns     rG   r   z#is_pointwise_use.<locals>.<genexpr>  s     KA#A7Ks   )r3  rf   targetr@   _ops
OpOverloadr   getitemr   r4  rj   usersTag	pointwisetags)user>  r?  s    ` rG   r<  r<  s  s     66_$3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHrd   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrB   )rK  g
graph_argss    rG   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg  s,    #}}s3z?"3455rd   r)   r   Tensor)rK  torch.Tensorr{   r(   )r@   fxGraphr:  r   rQ  rB   r1  returnsr   r   outputr&   )r?  rm   kwargsrP  nodegmrN  rO  s         @@rG   gen_gm_and_inputsrZ    s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rd   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr   )rI   rA   r   r   s     rG   r   r     s4    /7$$&$$& 'rd   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   r@   manual_seedtimeperf_counterr   )modelexample_inputstimesr   t0r   resultt1s           rG   timedrg    sr     	d				B5\ 'F 
			B7Nrd   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)r@   r   r   rg  medianprintitem)	ra  rb  rc  repeatbaseliner   r   timingstooks	            rG   print_performancerp    sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S rt   r   )re  s   rG   r7  z#precompute_method.<locals>.<lambda>  s     rd   N)r?   setattr)objmethodre  s     @rG   precompute_methodrv    s     !WS&!#FC(rd   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)rv  )rt  methodsru  s      rG   precompute_methodsry    s     '#v&'rd   c                <    t        | |kD        t        | |k        z
  S rt   )ru   )r   r   s     rG   cmpr{    s    q1u:AE
""rd   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr)   r   )rf   ru   rB   r   )rD   sizes     rG   pad_listliker~    sC    !SsTz
1v{tAw!v%%Hrd   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr)   )r7   )rf   r   	schedulerr7   get_name)elemr7   s     rG   	sort_funcztuple_sorted.<locals>.sort_func  s1    dC K0$ 1222}}rd   r  )r  r[   r{   r   )rB   sorted)rD   r  s     rG   tuple_sortedr    s&    
1v{	 !##rd   PRVT)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y rt   r   )r   s    rG   clear_cachezCachedMethod.clear_cache  s    ),rd   c                     y rt   r   selfrm   rW  s      rG   __call__zCachedMethod.__call__  r8  rd   N)r   r   r{   None)rm   P.argsrW  P.kwargsr{   r  )r|   r}   r~   staticmethodr  r  r   rd   rG   r  r    s    , ,Drd   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y rt   )r   delattrr  r  s    rG   r  z"cache_on_self.<locals>.clear_cache  s    4D# rd   )r  r   r{   r  )r|   execlstripr   wrapsr  )r   r   ctxwrapperr  r  s        @rG   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNrd   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr)   irrX  ) r  rf   r  r   r   r   or_r   rX  originsr   r0   )node_scheduler  rX  s      rG   aggregate_originsr    s     -&LL *4(TYY 		!!
 L
 	
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rq|D cg c]Q  }|j                  dk(  r@d|j                  v r2|j                  d   #|j                  d   j                  j                  S }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenr:  r@   source_fn_stackr   r)   inductor_noder   fused)r  r3  r   _overloadpacketr|   r  r   rf   r   rL  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         rG   get_fused_kernel_namer  4  sm    $M2KO+ &
yyO+6;;.O,8	 KK(88AA
 
 G,-	g	%! 	:FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89	: G,-	o	-&1
"VYY/5QFKK
 
 "!G88WI'((5
(
s   AE(%E-:E-c                   t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d t        |      r{t        d |D              }t        |      dk(  r[|d   j                  t        d      s+i }t        j                        D ]
  \  }	}
|	||
<    |_        |j                  fd       |D ]  }d|j                  v rO|j                  d   @t        |j                  d   j                        }||   j!                  |j"                         d	|j                  v so|j                  d	   d   j"                  }||   j!                  |j"                          d
nd}|j$                   d| ddj'                  |j)                                ddj'                  |j)                                d}|j$                   dg}t+        |j-                               D ]@  \  }}|j!                  |j$                   d| ddj'                  t+        |                    B S|j!                  |j$                   d       |D ]0  }
|j!                  |j$                   d|
j/                                 2 |dj'                  |      fS c c}w )Nr:  c              3  4   K   | ]  }|j                     y wrt   )r'  )r   ns     rG   r   z&get_kernel_metadata.<locals>.<genexpr>g  s     "Cq177"Cr   r)   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S rt   )r  )r  single_graphs    rG   r7  z%get_kernel_metadata.<locals>.<lambda>q  s    lTTUVW rd   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  r3  collectionsdefaultdictr  rB   r   r'  r   r   nodesr  sortr   r   r  rL  r   commentr  keysr  itemsformat_node)r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapidxr  rX  r  sort_strmetadatadetailed_metadataoriginal_noder  r  s                     @rG   get_kernel_metadatar  X  s    $M2K+6W&)):VfWNW ,,T2N$006
 L
>""CN"CC}")!,22L<)TU"$'(:(:; -FC),OA&-IXFW     2dii'DIIo,F,Rdii0@@ACs#**4995$))#))K(+00C3&&tyy12 *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF 	OA $$'8AMMO;L%MN	O
 TYY0111g Xs
   J?J?c                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)r  r   rC   rC  addrL  )initial_queueskip_filterdominated_setrX  users        rG   dominated_nodesr    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  rd   c                   dd l }ddlm dfd|j                         D cg c]  } |      s|j                   }}| D cg c]  } |      s|j                   }}t         |j                  g ||       S c c}w c c}w )Nr   r)   r  c                    t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr t        | j
                        S rt   )rf   	TensorBoxdata
StorageBoxr1   	Pointwise)r  r  is_unrealized_nodes    rG   r  z*gather_origins.<locals>.is_unrealized_node  s^    a&%aff--a'%aff--!RYY'GJq",,,GGrd   )r  r1   r{   r   )	itertoolsr  r  r   r  r   chain)	rm   rW  r  valkwarg_originsrK  arg_originsr  r  s	          @@rG   gather_originsr    s     H -3MMOWS?QRU?VS[[WMW*.J32DS2I3;;JKJoiooC{C]CDD XJs   BBBBc                6   t        | t        j                        r| j                  S t        | t        j                        r)dj                  t        t        | j                              S t        | t        j                        r)dj                  t        t        | j                              S t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))rf   rg   Symbolr   rh   r  rk   	sympy_strrm   MulrS   rP   rQ   rR   funcr|   r   )exprs    rG   r  r    s     $%yy$		"zz#i344$		"zz#i344$(HhGH))$$%QtyyY		1J'K&LANNt9rd   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr)   r#  current_node
index_expr)
r&  r$  rX   compute_all_boundsr?   interpreterr?  rV   rW   unknown)r   r$  fx_nodes      rG   get_bounds_index_exprr    sN     	!!~tDDWDNNl*5!!""$$rd   c                    | d   dk(  S )Nr   rr   )prefixs    rG   prefix_is_reductionr    s    !9rd   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rU   SIZErT   )r  r  s     rG   sympy_index_symbol_with_prefixr    s)     TYY vsDdCCrd   c                N    | xs t         j                  xr t         j                  S rt   )rX   debug_index_assertsassert_indirect_indexing)checks    rG   generate_assertr    s    /V//TV5T5TTrd   c                F    | d   dk7  sJ t        j                  | dd      S )r  r   sTr  )rg   r  r   s    rG   sympy_index_symbolr    s)     7c>> <<d==rd   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nr  )rf   rg   r   r   r  r   is_nonnegative)replacedreplacements     rG   	to_symbolzsympy_subs.<locals>.to_symbol   sP     (EJJ///k3'<< ++$33  rd   )r  rz   r  zUnion[sympy.Expr, str]r{   sympy.Symbol)rg   r   xreplacer  )r  replacementsr  krp   s        rG   
sympy_subsr    sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wrt   is_symbolicr   rD   s     rG   r   zis_symbolic.<locals>.<genexpr>  s     N1AN   )	rf   r@   r$   rQ  r0  r  r  r}  stride)r   s    rG   r  r    sS    a& 1ell# 	ON	!((*(MNNrd   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wrt   r  r/  s     rG   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,r   r0  )rm   s    rG   any_is_symbolicr%    s    ,t,,,rd   c                4   ddl m} t        g d      }t        j                         r|j                  d       | j                  j                  D ]G  }t        |j                        |v r|c S |j                  j                  d      x}< ||      sE|c S  y )Nr   )free_unbacked_symbols)z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesr'  r   r@   $are_deterministic_algorithms_enabledupdater'  r  r   r?  r   get)rY  r'  forbidden_setrX  r  s        rG   %get_first_incompatible_cudagraph_noder/    s     L	
M  113	
"  t{{},K99==''C49Ns9SK	
 rd   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphrV  )nextiterreversedr'  r  r3  )rY  	last_nodes     rG   output_noder5  M  s6    T(288>>234I<<8###rd   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zq
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_inductor_cache().
    cache_clearz# does not have a cache_clear method)r   callabler8  AttributeErrorr6  rL  rt  s    rG   clear_on_fresh_inductor_cacher<  W  s?    
 3&hs.Gu$GHIIc"Jrd   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r6  r8  r;  s    rG   clear_inductor_cachesr>  c  s     " rd   c                 "   t        t        j                  j                               D ]  } | j	                  d      st        j                  |    }|j
                  j                         D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]0  }|j                  j                  j                   j#                          2  t        j                  | =  dt        j                  v rRt        j                  d   }t%        |j&                  j(                  j*                        `|j&                  j(                  `t/        j0                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)r  sysmodulesr  
startswith__dict__r?   rf   r@   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultskernelrunmod__del__r   driveractiveutilsinstancegccollect)module_namem	attr_namerJ  re  rL  s         rG   unload_xpu_triton_pydsrW  n  s,   CKK,,./ %%%&NOKK$* 	8I##I. I.EOO33EEVV #)"8"8 8))--5578	8 KK$% #++-kk12""(()2JJ#JJLrd   c              #  h  K   t                t        j                  |      	 t        j                  j                  t        j                  di      5  t        j                  d       t        j                  j                  d      }t        j                  j                  t        j                  d|i      5  d t        | t
              rt        |       dk(  sJ d       t        j                  j                  |      rtt        j                  |      }| j!                  |D ci c]D  }d	|vr>|t        j                  j#                  t        j                  j                  ||            F c}       ddd       ddd       |rLt%               r(t&        j(                  j+                         r
t-                t/        j0                  fd
       t                yc c}w # 1 sw Y   oxY w# 1 sw Y   sxY w# t2        $ r t        j5                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  pathr^  inductor_cache_dirs      rG   r7  z&fresh_inductor_cache.<locals>.<lambda>  s    S[[@&% 6A 6 rd   )onerrorz(on error, temporary cache dir kept at %s)r>  tempfilemkdtempr   patchdictosenvironr   r   r`  r  rf   rB   existslistdirr,  getsize
is_windowsr@   r;   rA   rW  shutilrmtree	Exceptionr_  )cache_entriesrY  deletetriton_cache_dirfilesfra  s         @rG   fresh_inductor_cacheru    s     !))c2& ZZ__JJ24FG
 	 II35GH!ww||,>I.@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	$ |		 6 6 8&(MM"
 	3 	 	B  >@RS 	sn   !H20G? A'G3<A-G')A	G"2G'9G3AG? H2"G''G0	,G33G<8G? ?!H  H# #H//H2c           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)r  reverse)__getitem__r   rB   r  r3  r  )seqgettera_rs      rG   argsortr|    s1    __F
C/C>?@@rd   c           	     2    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |            }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)rf   r   evaluate_expr)r  r)  s    rG   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s(    $%**4*EErd   r   r)   r   )r  z%Union[bool, torch.SymInt, sympy.Expr]r{   r   r   )r   r   a_idxa_valb_idxb_valr  r)  s          rG   r{  zargsort_sym.<locals>.cmp  sT    uu	F
 EEM"EEM"
 5=5=rd   r  )r   tuple[int, sympy.Expr]r   r  r{   ru   )	r   rf   r@   r$   rX  r  r  r   
cmp_to_key)r)  ry  r{  r  r  exprsr   re  s   `       rG   argsort_symr    s    4  nC 
Z5<<8affkka@E  5i22378E %&fc1c&F&M
 's   >B<Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nr_   r   r   )r@   r  r   element_sizer  s    rG   get_dtype_sizer    s-     ;;r'4466rd   c                      e Zd ZU ded<   y)LineContextr   contextNr|   r}   r~   __annotations__r   rd   rG   r  r    s    Lrd   r  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr   rx   zlist[tuple[int, LineContext]]line_mapNr  r   rd   rG   r  r    s    J++rd   r  c                      e Zd ZdZdddZddZddZddZddZddZ	ddZ
dd	Zdd
Z	 	 	 	 ddZdddZdddZdddZ	 d	 	 	 	 	 d dZd!dZddZd"dZy)#IndentedBuffer   c                     g | _         || _        y rt   )_lines_indent)r  initial_indents     rG   __init__zIndentedBuffer.__init__   s    GI%rd   c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr)   r  )r
   r  rf   DeferredLineBaser  rL  r  r   writecountr  getvalue)r  bufr   linemaplilines         rG   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88rd   c                6    | j                         j                  S rt   )r  rx   r  s    rG   r  zIndentedBuffer.getvalue  s    '')///rd   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r  )	r
   r  rf   r  r  r   endswithr  r  )r  r  r  r  s       rG   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~rd   c                8    | j                   j                          y rt   )r  clearr  s    rG   r  zIndentedBuffer.clear/  s    rd   c                ,    t        | j                        S rt   )r   r  r  s    rG   __bool__zIndentedBuffer.__bool__2  s    DKK  rd   c                :    d| j                   | j                  z  z  S )Nr  )r  tabwidthr  s    rG   r  zIndentedBuffer.prefix5  s    dllT]]233rd   c                &    | j                  d       y )Nr  	writeliner  s    rG   newlinezIndentedBuffer.newline8  s    trd   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )rf   r  r  rL  r  with_prefixr  stripr  r  s     rG   r  zIndentedBuffer.writeline;  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rd   c                4    |D ]  }| j                  |        y rt   r  )r  linesr  s      rG   
writelineszIndentedBuffer.writelinesE  s      	!DNN4 	!rd   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwrt   r  )offsetr  s   rG   r  z"IndentedBuffer.indent.<locals>.ctxL  s9     LLF"L'&&s   A4 AAAr{   Iterator[None])
contextlibcontextmanager)r  r  r  s   `` rG   indentzIndentedBuffer.indentK  s$    		"	"	' 
#	' urd   c                .    | xj                   |z  c_         y rt   r  r  r  s     rG   	do_indentzIndentedBuffer.do_indentV      rd   c                .    | xj                   |z  c_         y rt   r  r  s     rG   do_unindentzIndentedBuffer.do_unindentY  r  rd   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r  )rf   r  floatr  r  minrB   r  mathisinfrL  r  ru   textwrapdedentrstripr  )r  
other_coder  r  r  r  s         rG   splicezIndentedBuffer.splice\  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"rd   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r  )r  r  r  )r  r  r   r  s       rG   rk   zIndentedBuffer.mapu  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nr  r  )r   r  r  s    rG   __repr__zIndentedBuffer.__repr__z  s     t*Qt}}/q11rd   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S r  )r  r  r  r  )r  otherr   s      rG   __add__zIndentedBuffer.__add__}  sK    ||u}},,,DLL9t{{#u||$
rd   Nr   )r  ru   r{   r  )r{   r  r{   r   r{   r  r{   r   )r  z)Union[LineContext, DeferredLineBase, str]r{   r  )r  z3Sequence[Union[LineContext, DeferredLineBase, str]]r{   r  rr   )r  ru   r{   'contextlib.AbstractContextManager[None])r  ru   r{   r  )F)r  zUnion[IndentedBuffer, str]r  r   r{   r  )r  zCallable[[Any], Any]r{   r  )r  r   r{   r  )r|   r}   r~   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rk   r  r  r   rd   rG   r  r    s    H&9(0(!4#!H!	!	 EJ"4"=A"	"2
2rd   r  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y rt   )superr  )r  	__class__s    rG   r  zFakeIndentedBuffer.__init__  s    rd   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rG   r  z#FakeIndentedBuffer.__getattribute__  s;    ;**466!$ (= =
 	
rd   r  )r   r   r{   r   )r|   r}   r~   r  r  __classcell__r  s   @rG   r  r    s    
rd   r  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwrt   )rA  stdoutstderr)initial_stdoutinitial_stderrs     rG   restore_stdout_stderrr    s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y r  )r  r  r  s     rG   r  zDeferredLineBase.__init__  s    zz|D	rd   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  s    rG   r  zDeferredLineBase.__call__      !!rd   c                    t         )z3Returns a new deferred line with the same conditionr  r  s     rG   	_new_linezDeferredLineBase._new_line  r  rd   c                @    | j                  | | j                         S rt   r  r  )r  r  s     rG   r  zDeferredLineBase.with_prefix  s    ~~455rd   c                T    | j                  | j                  j                               S rt   )r  r  r  r  s    rG   r  zDeferredLineBase.lstrip  s    ~~dii..011rd   c                >    | j                  | j                  |         S rt   r  )r  r   s     rG   rx  zDeferredLineBase.__getitem__  s    ~~dii.//rd   c                ,    t        | j                        S rt   )r   r  r  s    rG   r  zDeferredLineBase.__bool__  s    DIIrd   c                ,    t        | j                        S rt   )rB   r  r  s    rG   __len__zDeferredLineBase.__len__  s    499~rd   N)r  r   )r{   zUnion[str, None])r  r   r{   r   )r  r   r{   r   )r{   r   )r   zUnion[int, slice]r{   r   r  r{   ru   )r|   r}   r~   r   r  r  r  r  r  rx  r  r  r   rd   rG   r  r    s-    8
""620rd   r  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y rt   )r  r  r  value_fn)r  r  r	  r  r  s       rG   r  zDelayReplaceLine.__init__  s     rd   c                j    | j                   j                  | j                  | j                               S rt   )r  replacer  r	  r  s    rG   r  zDelayReplaceLine.__call__  s#    yy  4==?;;rd   c                D    t        | j                  | j                  |      S rt   )r  r  r	  r  s     rG   r  zDelayReplaceLine._new_line  s    $-->>rd   )r  r   r	  zCallable[[], str]r  r   r  )r  r   r{   r  )r|   r}   r~   r   r  r  r  r  r  s   @rG   r  r    s    @!
<?rd   r  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTr;   r\   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rf   r@   r   rH   r   createversionhipmajorr   r_  r   multi_processor_count)index_or_devicer   propr  r  s        rG   
is_big_gpur    s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 rd   c                 T    t         j                  j                  d      j                  S )Nr9   )r@   r9   get_device_propertiesr  r   rd   rG   get_max_num_smsr    s    ::++F3IIIrd   c                 d    t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )r@   r   _get_sm_carveout_experimentalr  )carveouts    rG   get_num_smsr"    s1     xx557HH,@HHaHHrd   c                    ddl m}m} |j                  d      }t	               | z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r)   )r*   WorkspaceZeroModeF)r  	zero_moder   
outer_name)codegen.commonr*   r$  	from_boolr"  TMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   r*   r$  r%  r}  s         rG   get_tma_workspace_argr,    sO    
 @!++E2I=..1DDD+<++-	 rd   c                 j    t         j                  xs" t         j                  xs t         j                  S rt   )rX   max_autotunemax_autotune_gemmsearch_autotune_cacher   rd   rG   use_max_autotuner1    s&    Wv77W6;W;Wrd   c                    t        | j                  j                        xr% | j                  |v xr t	        | j                        S rt   )is_gpur   r   r   r  )layoutallowed_layout_dtypess     rG   _use_template_for_gpur6    s>     	v}}!!" 	&LL11	&v}}%rd   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w N,)upperrX   max_autotune_gemm_backendsr  r  backendrD   s     rG   _use_autotune_backendr>    M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r8  )r:  rX   max_autotune_conv_backendsr  r  r<  s     rG   _use_conv_autotune_backendrC    r?  r@  F)enable_int32enable_float8c               r   ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xr6 t!               xr* t#        d      xr  || j                  |j$                        S )Nr)   )BackendFeaturehas_backend_featurer   TRITON)r'  rG  rH  r@   r  r  r  r  extendr   r   r3  r   r   r6  r   r1  r>  TRITON_TEMPLATES)r4  rD  rE  rG  rH  layout_dtypess         rG   use_triton_templaterM  #  s     D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M		P 		P "(+		P  ~/N/NOrd   c                     ddl m} ddlm dfdt        j
                  j                  xr  |       xr t        fd| D              S )Nr   )has_triton_tma_devicer)   r#  c                   t        | j                               dk7  ry| j                         }|t        j                  t        j
                  fvry| j                         }|j                         }|j                         s|sy|j                  d   }|r|j                  d   }||j                  z  }j                  j                  j                  |t              S )N   Fr)   r   )rB   get_size	get_dtyper@   r  r  
get_layoutis_transposedis_contiguousr}  itemsizer'  r(  statically_known_multiple_ofTMA_ALIGNMENT)rD   r   r4  
transposed	inner_diminner_bytesr$  s         rG   _is_tma_compatiblez3use_triton_tma_template.<locals>._is_tma_compatible@  s    qzz|!77))+
$$&*KKN	AI%..0ww<<[-XXrd   c              3  .   K   | ]  } |        y wrt   r   )r   rU  r]  s     rG   r   z*use_triton_tma_template.<locals>.<genexpr>V  s     8!"1%8   rD   r1   r{   r   )torch.utils._tritonrO  r&  r$  rX   r[  enable_persistent_tma_matmulrj   )matricesrO  r$  r]  s     @@rG   use_triton_tma_templaterd  ;  sA    9Y( 	22 	9!#	98x88rd   c                   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  t        j                   g}t#        | |      xr t%               xr t'        d      }|r |       st(        j+                  d	       y|S )
Nr)   r#  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r&  r$  r'  r(  	size_hintrX   r9   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsrh  r@   r  r  r  r  r  r  r6  r1  r>  r   r_  )	r4  rU  r  r  r$  	gemm_sizerh  rL  r   s	            rG   use_cutlass_templatern  Z  s      **1q519r*BIA~V[[%N%NN> }}]]ENNEMM5;;OMfm4 	-	-!),  !#KK4
 Jrd   c                T    t         j                  j                  |       j                  S rt   )r@   r9   r  gcnArchNamer   s    rG   _rocm_native_device_arch_namerr  x  s    ::++F3???rd   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S rt   r   r   rd   rG   rt  z*try_import_ck_lib.<locals>.gen_ops_library      Ird   c                     g S rt   r   r   rd   rG   ru  z.try_import_ck_lib.<locals>.gen_ops_preselected  rx  rd   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r|   r}   r~   r   rd   rG   rv  r{    s    rd   rv  )r{   rH  )ck4inductor(ck4inductor.universal_gemm.gen_instancesrt  ru  ck4inductor.universal_gemm.oprv  rg  r`  dirname__file__r   )r|  rt  ru  rv  package_dirnames        rG   try_import_ck_libr  }  sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                   t               syt        j                  j                  sy| j                  j
                  dk(  syt        | j                        }t        j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t        j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                  t        j                   fvryt#               \  }}}}|st$        j'                  d       yt        j(                         r|t        j                  _        t        j                  j*                  st$        j'                  d       y|t        j                  j*                  k7  rt$        j'                  d       yyc c}w c c}w )	NFr9   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r1  r@   r  r  r   r   rr  rX   rocmarchr  r  ck_supported_archr   r  r  r  r  r   r_  	is_fbcodeck_dir)r4  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rG   use_ck_templater    s|   ====' 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   2GGc                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr)   r#  CKr   rf  r   )r&  r$  r>  r  r'  r(  rj  )r4  rU  r  r  r$  s        rG   use_ck_gemm_templater    sR     	d# 	CF#	CGG&&q1uqy2&>Brd   c                2    t        d      xr t        |       S )Nr  )rC  r  r4  s    rG   use_ck_conv_templater    s    %d+G0GGrd   c                L    t               xr | j                  j                  dk(  S r\  )r1  r   r   r  s    rG   _use_template_for_cpur    s    =&--"4"4"==rd   c                    ddl m} t        |j                  |      sJ t	        | ||d      xr |j                  j                         S )Nr)   )r2   F)require_constant_mat2)r  r2   rf   r4  use_cpp_gemm_templaterV  )r4  mat1mat2r2   s       rG   use_cpp_bmm_templater    sE     dkk6*** 	fdDN 	(KK%%'rd   c                `   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr)   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r)   )freeze_layout
get_striderD   s    rG   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1  s"    	||~b!Q&&rd   r`  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r>  rX   cppweight_prepackrS  r@   r  r	  r  r  halfr   has_free_symbolsrf   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r4  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmrL  rU  r  r  r  r   r  r  s                       rG   r  r    s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Ard   c                 2    t                xs t        d      S )NATEN)r1  r>  r   rd   rG   use_aten_gemm_kernelsr  '  s    !!B%:6%BBrd   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y rt   )r1  r  counterr   r  s    rG   r  zDebugDirManager.__init__/  s    ../rd   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)r@   _dynamorX   debug_dir_rootr  r   new_namer  s    rG   	__enter__zDebugDirManager.__enter__2  sM    $}}33BB//0dggY?.2mm+rd   c                    t        j                  | j                         | j                  t        j
                  j                  _        y rt   )rm  rn  r  r  r@   r  rX   r  )r  rm   s     rG   __exit__zDebugDirManager.__exit__7  s*    dmm$.2.B.B+rd   Nr  )rm   r   r{   r  )
r|   r}   r~   r  r  r  r  r  r  r  r   rd   rG   r  r  +  s(    iooa G0<
Crd   r  c                    ddl m} g dfd}t        j                  j	                  |d|      5  t
        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr)   r-   c                (    j                  |        y rt   rL  codesource_codess    rG   save_output_codez*run_and_get_code.<locals>.save_output_codeE      D!rd   r  r  r   r{   r  r'  r.   r   re  r  r@   r  reset)r   rm   rW  r.   r  re  r  s         @rG   run_and_get_coder  <  su    
 % L" 
		=*<>N	O %T$V$% <% <s   'A$$A0c                    t        | g|i |\  }}g }|D ]6  }|j                  t        j                  d|t        j                               8 ||fS )Nz	'''.*?''')r  rJ  refindallDOTALL)r   rm   rW  re  r  kernelsr  s          rG   run_and_get_kernelsr  N  sZ     ,B@@@FLG Brzz,bii@AB7?rd   c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S rt   )r   backward)re  r   s    rG   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwardY  s!    

rd   )r{   r   )r  )r   r  s   ` rG   run_fw_bw_and_get_coder  X  s    
 -..rd   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r)   r-   c                (    j                  |        y rt   r  r  s    rG   r  z"get_code.<locals>.save_output_codeg  r  rd   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y rt   r   r  s    rG   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__n  s    rd   c                     y rt   r   r  s      rG   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callq  s    rd   Nr  rm   r   rW  r   r{   r  )r|   r}   r~   r   r  r  r   rd   rG   DummyModuler  k  s    Frd   r  )cpp_wrappercodegen_with_cpp_wrappercodegenrx   )r  r  wrapper_codekernel_coder  s       rG   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_modulej  s]    	 	 04/?/?D))+T\\^ 	"k
 	++,[../}rd   compile_to_moduler  Nr  )r  r.   r{   r   r  )r   rm   rW  r.   r  r   r  r  s         @@rG   get_coder  a  s    $ L". 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr)   rQ  z%expected one or two code outputs got r   )r  rB   )r   rm   rW  r  s       rG   get_triton_coder    sQ    B000LL!&Q& 
/L0A/BC& ?rd   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r  )r  rB   )r   rm   rW  r   r  s        rG   run_and_get_triton_coder    sU    &r;D;F;OA|L!&Q& 
/L0A/BC& ?rd   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r-   r5   c                 ^     | i | | d   }t        |      sJ j                  |       y )NrQ  )rf   rL  )rm   rW  r'  r.   graph_lowerings	real_inits      rG   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init  s7    4"6"Q%///u%rd   r  r  )torch._inductor.graphr.   torch._inductor.output_coder6   r  r   re  r  )	r   rm   rW  r6   r   re  r.   r  r  s	         @@@rG   run_and_get_graph_loweringr    sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr   partial)aten_opoverride_fnr  orig_fns       rG   override_loweringr    s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S rt   r   )r  r  outr  post_fnpre_fns      rG   r  z(add_scheduler_init_hook.<locals>.wrapper  s+    y% i'Iu%
rd   r  )r  r   r  r   r{   r   )torch._inductor.schedulerr  r  unittestr   re  r  )r  r  r  r  r  s   ``  @rG   add_scheduler_init_hookr    s9     4  G ==%%iWEErd   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rX   developer_warningsr   r_  info)msgs    rG   developer_warningr    s$       Crd   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr)   r   -z--only=N)rA  argvr   rB   
ValueErrorrC  )r  rK  s     rG   get_benchmark_namer    s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr)   Nr   r  s     rG   r   zis_ones.<locals>.<genexpr>	       %!qAv%   rj   r  s    rG   is_onesr'        %u%%%rd   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r  s     rG   r   zis_zeros.<locals>.<genexpr>  r#  r$  r%  r&  s    rG   is_zerosr+    r(  rd   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)rf   r@   rQ  r   )r   rk  s     rG   r   z is_cpu_device.<locals>.<genexpr>  s8      dELL) 	u||E**s   AAr%  )inputss    rG   is_cpu_devicer/    s       rd   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rf   rg   r   r   r@   r  r  )r  s    rG   get_sympy_Expr_dtyper1    s=    c5::& B& ~~{{}}rd   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwrt   )r@   r   r   )should_profilerm   rW  r   s       rG   maybe_profiler4  "  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr)   )rX   r  threadsr@   get_num_threads)r7  s    rG   r  r  +  s+    jj  G{'')Nrd   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr)   )get_backend_options
num_stagesrQ     )runtime.triton_helpersr:  r-  r@   r  r  )r:  optionss     rG   get_backend_num_stagesr?  2  s2    ;!#G;;|%--*;*;QCCCCrd   c                   ddl m}m} | t        j                  t        j
                  t        j                  fv sJ t        j                  |      j                  j                  d      rddlm}  |       }| t        j                  t        j
                  fv r	 || |      S t        j                  j                  j                  j                   r |t        j                  |      S  |t        j                  |      S | t        j                  t        j
                  fv r ||       S t        j                  j                  j                  j                   r |t        j                        S  |t        j                        S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops
clock_rate)max_clock_rate)triton.testingrA  rB  r@   r  r  r  inspect	signature
parametersr-  torch._utils_internalrD  backendsr9   matmul
allow_tf32)r   rA  rB  rD  sm_clocks        rG   get_device_tflopsrN  :  s   MU]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rd   c                     ddl m}   |        S )Nr   get_dram_gbps)rE  rQ  rP  s    rG   get_gpu_dram_gbpsrR  V  s    ,?rd   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   rN  max_shared_mem)triton.runtimerN  rO  rP  r  r-  rT  s    rG   get_gpu_shared_memoryrW  ]  s.    %==44Q7;;<LaPPrd   c                $    | j                  d      S )Nwelford)rC  reduction_types    rG   is_welford_reductionr\  c  s    $$Y//rd   c                (    t        |       ry| dk(  ryy)Nr<  online_softmax_reducerQ  r)   )r\  rZ  s    rG   reduction_num_outputsr_  g  s    N+	2	2rd   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rd   rG   is_linuxrd  p  s    ??''rd   c                 (    t         j                  dk(  S )NrZ   )rA  rb  r   rd   rG   rl  rl  t  s    <<7""rd   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wrt   )rf   rg   r   	is_numberr  s     rG   r   z#has_free_symbols.<locals>.<genexpr>y  s)     Jz!UZZ(<_<Js   35r$  )itrs    rG   r  r  x  s    JcJJJrd   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr)   r  r   Tzunexpected type for is_dynamic F)r  r  rf   r  r  r  ComputedBufferr/   r  maybe_get_sizemaybe_get_strider1   	TypeErrorr   )rm   r  ts      rG   
is_dynamicrp  |  s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI rd   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r|   r}   r~   rs  rt  r   rd   rG   rr  rr    s      K *rd   rr  c                   ddl m} t        j                  ddd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |	       t        |j                  |	       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d
|j                   |	       t        |j                  |	       |j!                         |j!                         k(  }
t"        j%                  d||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr)   )stable_topological_sortwzutf-8F)modeencodingrq  )rY  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherrv  rc  NamedTemporaryFileior
   rN   rJ   	propagaterj  r'  r	   nowrM   lint	recompiler  r   r  r   )r  rY  inpr  rv  rt  	before_ioafter_io
start_timetime_elapsedro  s              rG   pass_execution_and_saver    sX    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
	 	
 
s%   BF4<F(CF4(F1	-F44F=c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r)   r  )r  r  rf   CppTemplateBufferr4  MultiOutputLayout	input_bufr  s     rG   is_multi_outputs_templater    s9     i!5!56 :"..< rd   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r)   r  r   )r  r  rf   MultiOutputrB   r.  r  r  s     rG   #is_output_of_multi_outputs_templater    sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:rd   c                   | yddl m} t        |       |j                  k(  xr |d u xs | j                  |u xsB t        |       |j
                  k(  xr' t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  S )NFr)   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   _CollectiveKernelop_overloadFallbackKernelr   r@   r   torchrecr  defaultr  r  rX  r3  r  s      rG   is_collectiver    s1    | 	T
b***Ud
0Td>N>NRT>T 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX+rd   c                >    ddl m} t        |       |j                  k(  S Nr)   r  )r  r  r   _WaitKernel)rX  r  s     rG   is_waitr    s    :''rd   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wrt   )contains_collectiver  s     rG   r   z&contains_collective.<locals>.<genexpr>  s     @a&q)@r   )r  r  rf   r0  snodesr  rX  snoder  s     rG   r  r    s4    >%-.@5<<@@@$$rd   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   r  c              3  2   K   | ]  }t        |        y wrt   )contains_waitr  s     rG   r   z contains_wait.<locals>.<genexpr>  s     :=#:r   )r  r  rf   r0  r  r  rX  r  s     rG   r  r    s4    >%-.:U\\:::uzz""rd   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r  )r  r  rf   r@   r@  rA  r  r  r  s      rG   is_fallback_opr    sE     "ejj++,TdB--.I43C3Cr3IIrd   c                B    |||    j                   j                            S rt   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rG   buf_name_to_fused_snoder    s#     k(3??HHJKKrd   c                     yr6  r   r  s    rG   r7  r7  *  r8  rd   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rG   r  r  %  sn     55!'' 
5HHk#5
 "44##	

rd   c                     yr6  r   r  s    rG   r7  r7  C  r8  rd   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )r  get_outputsrC  rX  r  find_recursive_users_of_node)r  r  r  r  r  or  user_ops           rG   r  r  >  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	rd   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rQ  r   )r@   
_functorchrX   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rG   num_fw_fixed_argumentsr  [  s6     $$::   "669SSSrd   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rG   is_saved_tensorz'count_tangents.<locals>.is_saved_tensork  sH    aff$ .!&&(.!/.  qvv-		
rd   r   rM  r)   )rD   r(   r{   r   )r'  r  r3  rL  r  r   rB   )fx_gr  	arg_countstatic_arg_idxsr  s        rG   count_tangentsr  f  s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????rd   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr   rx   c                    | j                   S rt   )rx   r  s    rG   r  zBoxedBool.__bool__  s    zzrd   c                6    t        | t              r	d| _        | S yr6  )rf   r  rx   r;  s    rG   disablezBoxedBool.disable  s    c9%CIJrd   Nr  )rt  r   r{   zUnion[BoxedBool, bool])r|   r}   r~   r  r  r  r  r   rd   rG   r  r    s     K  rd   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr)   r+   c                @    j                  |        | |||||      S rt   r  )r  kernel_namer  r  gpucpp_definitionkernel_listorig_define_kernels         rG   define_kernelz.collect_defined_kernels.<locals>.define_kernel  s-     	;'!+{Hc>
 	
rd   r  )NTN)r  r,   r  r   r  r   r  Optional[str]r  r   r  r  r{   r   )codegen.wrapperr,   r  r   re  r  )r  r,   r  r  s   `  @rG   collect_defined_kernelsr    s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   r  s    rG    get_cloned_parameter_buffer_namer    s    .  rd   c                    | t         v S rt   )r>   rq  s    rG   r3  r3    s    Yrd   c                    t        |       S rt   )r3  rq  s    rG   device_need_guardr    s    &>rd   c                D   t        j                         rS| t        j                  k(  r@t        j                  j                         r"t        j                  j                         dk\  ry| t        t        j                  t        j                  t        j                  g      v S )N)r  r   F)
rX   r  r@   r  r9   rA   get_device_capabilityr   r  r   r  s    rG   ,needs_fallback_due_to_atomic_add_limitationsr    sg    
 	U^^#JJ##%JJ,,.&8
EKKU^^#LMMMrd   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFr  r   r   r)   )overloadpacketr@   r   atenscatter_reduce_scatter_reducescatter_r3  r  rX   r  fallback_scatter_reduce_sumdynamic_threadsr  r   r  r+  )r  r[  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rG   use_scatter_fallbackr    sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!rd   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr   r  r  r  rj  rB   r   rf   is_reductionrX  r  reduction_hintread_writesreadswritesr   r   )r  r   r  r  r  rX  is_redr  s           rG   dump_node_scheduler    s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hrd   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )statically_known_true)r*  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  s     rG   tensor_is_alignedr  	  s:     L 				 >&,,#?	??RVWW rd   c                |    t        | j                  j                        syt        j                  xs t        |       S r6  )r3  r   r   rX   assume_aligned_inputsr  )example_inputs    rG   should_assume_input_alignedr  	  s2     -&&++,''K+<]+KKrd   c                     t         j                  j                  j                         } | st	        j
                         S | j                  j                  }|st	        j
                         S |j                         S rt   )	r@   _guardsTracingContexttry_getr  nullcontextrz  r)  suppress_guards)tracing_contextr)  s     rG   #maybe_get_suppress_shape_guards_ctxr  	  sb    
 mm22::<O%%''  ))33I%%''$$&&rd   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r  r   re  r  rX   r@   r  r  r~  loggingr
   StreamHandlertorch._inductor.codecacher   
addHandlerlevelsetLevelDEBUGr  removeHandler)r   rm   rW  r~  r!  log_capture_stringchr   
prev_levelre  r  s              rG   run_and_get_cpp_coder,  .	  s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                    t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y rt   )rJ   r)  rf   r@   r$   rX  )r.  rz  inputs      rG   shape_env_from_inputsr/  G	  sT     (I """  (eU\\*::'''(
 rd   c                8     t              dk(  r S d fd}|S )Nr   c                ,    t        |         |       S rt   )copy_misaligned_inputs)
new_inputsinputs_to_checkra  s    rG   rK  z)align_inputs_from_check_idxs.<locals>.runb	  s    z?;Z  rd   )r3  list[InputType]r{   r   )rB   )ra  r4  rK  s   `` rG   align_inputs_from_check_idxsr6  [	  s#     ?q ! Jrd   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr"  r   )r   shaper!  s      rG   r   z)clone_preserve_strides.<locals>.<genexpr>o	  s     Tf$Tr   r)   rr   )r}  r   r   r!  r@   
as_stridedclone)rD   needed_sizebuffers      rG   clone_preserve_stridesr>  i	  s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99rd   c                    |D ]I  }| |   }t        |t        j                        sJ |j                         t        z  s<t        |      | |<   K y rt   )rf   r@   rQ  data_ptr	ALIGNMENTr>  )r3  check_inputs_idxsr   _inps       rG   r2  r2  u	  sL      9!}$---==?Y&248JqM	9rd   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rf   r@   rQ  r@  rA  rL  rB   )r.  static_input_idxsaligned_static_input_idxsr  r.  s        rG   remove_unaligned_input_idxsrG  	  st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((rd   c                x   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }|j                  j                  j                  | |k        ry ||       xr  ||       |k  S )Nr)   r#  T)r&  r$  r@   iinfor  r   r'  r(  rj  r)  has_hintis_expr_static_and_true)r   r$  int_maxrj  rJ  s        rG   expr_fits_within_32bitrM  	  s    kk%++&**G  **Iww))22H 	ww//W=A;29Q<722rd   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S rt   )ru   deserialize_symexprevaluate_symexpr)r   fakify_first_callr)  s    rG   map_exprz4set_tracing_context_output_strides.<locals>.map_expr	  s7     ("1v((<<Q??$55a88rd   c              3  .   K   | ]  } |        y wrt   r   )r   r   rS  s     rG   r   z5set_tracing_context_output_strides.<locals>.<genexpr>	  s     5!(1+5r_  )r   r   r{   z,Union[float, int, SymInt, SymFloat, SymBool])
r@   r  r  r  output_stridesrB   r/  rL  rR  tuple)rb  compiled_graphr  r  r  rR  rS  r)  s        @@@rG   "set_tracing_context_output_stridesrX  	  s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Brd   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rX   fx_graph_remote_cacher  r@   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher[  ModuleNotFoundErrorjustknobs_getval_intrZ  s    rG    should_use_remote_fx_graph_cacherb  	  s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r  subr  s    rG   normalize_namere  	  s    66"C..rd   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuz^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rerd  r   _triton_type_mappingr-  )r   triton_type_names     rG   triton_typerk  	  s.    &**5#e*=##$46FGGrd   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nrg  r  )_torch_triton_mappingr-  r  r?   r@   rf   r   )r   adjusted_type	type_namer  s       rG   triton_type_to_torchrp  	  sL    )--eU;M%%eR0Iy)Ii---rd   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S rt   )	is_mkldnnr}  r!  r   r   untyped_storager@  r  r  rx   s     rG   is_same_tensorru  	  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rd   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S rt   )rr  r}  r   r   r@   r   mkldnnr@  rt  s     rG   is_same_mkldnn_tensorrx  	  s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrd   c                      y)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rd   rG   boolean_opsr  	
  s    rd   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler%   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr  r   rd   rG   r  r  
  s    8800rd   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y rt   )r  r  )r   r  r  s      rG   #register_op_dtype_propagation_rulesr  &
  s    
 (32(t$rd   c                    t         j                  j                  r2| t        j                  t        j
                  fv rt        j                  S | S )z"Maybe upcast [b]float16 to float32)rX   r[  codegen_upcast_to_fp32r@   r  r  r  r  s    rG   upcast_compute_typer  0
  s3    }}++%--00}}Lrd   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y rt   original_dict	new_items)r  r  s     rG   r  zScopedDict.__init__E
  s    *13rd   c                Z    || j                   v r| j                   |   S | j                  |   S rt   r  r  r  s     rG   rx  zScopedDict.__getitem__I
  s.    $.. >>#&&!!#&&rd   c                "    || j                   |<   y rt   )r  )r  r  rx   s      rG   __setitem__zScopedDict.__setitem__N
  s    #srd   c                >    || j                   v xs || j                  v S rt   r  r  s     rG   __contains__zScopedDict.__contains__Q
  s!    dnn$At/A/A(AArd   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S rt   )r  r  r-  )r  r  r  s      rG   r-  zScopedDict.getT
  s6    $.. >>#&&!!%%c733rd   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S r6  )rB   r  r  )r  r  r  s      rG   r  zScopedDict.__len__Y
  sC    ""# 	A***Q	 rd   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wrt   r  )r  r  s     rG   __iter__zScopedDict.__iter__`
  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S rt   )r   r  r  r  s    rG   r  zScopedDict.__bool__f
  s    D&&8$..99rd   c                    t         rt   r  r  s     rG   __delitem__zScopedDict.__delitem__i
  s    !!rd   )r  zMapping[KeyType, ValType])r  r  r{   r  )r  r  rx   r  r{   r  )r  r  r{   r   rt   )r  r  r  Optional[ValType]r{   r  r  )r{   zIterator[KeyType]r  )r  r  r{   r  )r|   r}   r~   r   r  rx  r  r  r-  r  r  r  r  r   rd   rG   r  r  =
  s5    4'
$B4
:"rd   r  )frozen_defaultfrozenc              (    dfd}| |S  ||       S )Nc                    t         j                  dk\  rt        j                  | d      S t        j                  |       S )N)r<  r  T)kw_onlyr  r  )rA  version_infodataclasses	dataclass)rw   r  s    rG   wrapzir_dataclass.<locals>.wrapo
  s;    w&((d6JJ ((V<<rd   )rw   r[   r{   r[   r   )rw   r  r  s    ` rG   ir_dataclassr  m
  s    = {9rd   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y rt   )r@   r  r  r  fw_metadatabw_donated_idxs)r  s    rG   get_donated_idxsr  |
  s=    mm22::<O"'B'B**:::rd   c                    ddl m}m} ddlm} | D ][  }|||fvs
|j
                  |j
                  j                  D cg c]  }|j                   c}|j                  j                  |<   ] y c c}w )Nr)   r  r#  )
codegen.simd_kernel_featuresr   r  r&  r$  rX  r  r   r   ._inductor_triton_kernel_to_post_grad_node_info)r  r  r   r  r$  rX  r  s          rG   'set_kernel_post_grad_provenance_tracingr  
  sq     P )9::yy$ #'))"3"3W KKWFF{SWs    A0c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r)   rQ  r<  r  N)r|   r}   r~   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   rd   rG   r  r  
  s     LKK	  Grd   r  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr[  r   AttrsDescriptor)	importlibutil	find_specr  r  triton.backends.compilertriton.compiler.compilerr   rJ  compilerr  r  r  )r[  s    rG   #get_triton_attrs_descriptor_versionr  
  s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333rd   c                 8    t               t        j                  k(  S rt   )r  r  r  r   rd   rG   triton_version_uses_attrs_dictr  
  s    .04P4X4XXXrd   r  )rb   ru   r{   ru   )rp   rz   r{   r   )   d   )r   zCallable[[], Any]r   ru   r   ru   r{   r  r  )r   z"Union[Optional[torch.device], str]r{   torch.device)r   zIterable[sympy.Expr]r{   rz   )r   Sequence[sympy.Expr]r   r  r{   rz   )r   zIterable[_T]r{   zValuesView[_T])r   Union[int, sympy.Expr]r   r  r{   r  )r  r  r{   r   )r   z"Iterable[Union[int, torch.SymInt]]r{   zlist[sympy.Expr])r   z Iterable[Union[int, sympy.Expr]]r{   zlist[Union[int, torch.SymInt]])r3  torch._ops.OpOverloadr{   r   )rG  r(   r>  z'Callable[[torch._ops.OpOverload], bool]r{   r   )r?  r   rm   rH  rW  dict[str, Any]r{   z&tuple[GraphModule, list[torch.Tensor]])r9   )r   r   r{   r  )r)   r9   )
ra  Callable[..., Any]rb  Sequence[Any]rc  ru   r   r   r{   r  )r   r  r  g      ?r9   )ra  r  rb  r  rc  ru   rl  ru   rm  r  r   r   r{   r  )rt  r   ru  r   r{   r  )rt  r   rx  	list[str]r{   r  )r   ru   r   ru   r{   ru   )rD   zUnion[int, Sequence[int]]r}  ru   r{   Sequence[int])rD   ztuple[_T, ...]r{   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r{   zCachedMethod[P, RV])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r{   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r{   r   )r  r  r  r,   r{   ztuple[str, str]rt   )r  zIterable[torch.fx.Node]r  zOptional[Callable[[Any], bool]]r{   zOrderedSet[torch.fx.Node])rm   zSequence[IRNode]rW  zdict[str, IRNode]r{   zOrderedSet[IRNode])r  rz   r{   r   )r   rz   r{   zValueRanges[Any])r  r   r{   r   )r  rU   r  ru   r{   r  )r
  r   r{   r   )r   r   r{   r  )r  rz   r  zdict[sympy.Expr, Any]r{   rz   )r   r   r{   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rm   r   r{   r   )rY  torch.fx.GraphModuler{   zOptional[torch.fx.Node])rY  r  r{   r(   )rt  r   r{   r   r  )NNT)rp  zOptional[dict[str, Any]]rY  r  rq  r   r{   r  )ry  r  r{   	list[int])r)  r'   ry  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r{   r  )r   torch.dtyper{   ru   r  r  )r  zUnion[int, torch.device]r{   r   r  )r+  ru   r   r  r{   r*   )r4  r2   r5  zlist[torch.dtype]r{   r   )r=  r   r{   r   )r4  r2   rD  r   rE  r   r{   r   )rc  r1   r{   r   )
r4  r2   rU  ru   r  ru   r  ru   r{   r   )r   r   r{   r   )r{   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r4  r2   r{   r   )r4  r2   r  zUnion[ReinterpretView, Buffer]r  r1   r{   r   )FTFN)r4  r2   r  r1   r  r1   r  r   r  r   r  r   r  zOptional[int]r{   r   )r   zCallable[P, _T]rm   r  rW  r  r{   ztuple[_T, list[str]])r   r  rm   r   rW  r   r{   tuple[Any, list[str]])r   r  r{   r  )r   r  rm   r   rW  r   r{   r  )r   r  rm   r   rW  r   r{   r   )r   r  rm   r   rW  r   r{   ztuple[Any, list[GraphLowering]])r	  r  r
  r  r{   r  )r  r  r  zOptional[Callable[..., Any]]r{   r   )r  r   r{   r  )r{   r  )r  r  r{   r   )r.  zSequence[torch.Tensor]r{   r   )r  rz   r{   r  )r3  r   rm   r   rW  r   r{   zIterator[Any])r[  r   r{   r   )r[  r   r{   ru   )ri  zIterable[Any]r{   r   )
r  r  rY  r&   r  r  r  r   r{   r  )r  z"Optional[Union[Buffer, Operation]]r{   r   )rX  z Optional[Union[Node, Operation]]r3  z!Optional[torch._ops.OperatorBase]r{   r   )rX  z"Optional[Union[IRNode, Operation]]r{   r   )r  r7   r{   r   )rX  zOptional[Operation]r3  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r{   r   )r  r   r  r  r  r  r{   r   )r  r7   r  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r{   r  )r  ru   r  ru   r{   ru   )r  r  r{   ru   )r  r  r{   r  )r   r   r{   r   )r   r  r{   r   )r   r   r{   r   )r   r  r{   r   )r  r  r[  r  r  r  r  r  r  r   r  r   r{   r   )r  r  r{   r  )r   rR  r{   r   )r  rR  r{   r   )r{   r  )r   r  rm   r   rW  r   r{   ztuple[Any, str])r.  Sequence[InputType]r{   zOptional[ShapeEnv])ra   Callable[[list[InputType]], Any]r4  r  r{   r  )rD   rR  r{   rR  )r3  r5  rB  r  r{   r  )r.  r  rE  r  r{   r  )r   rz   r{   r   )rb  r  rW  r6   r{   r  )r   r  r{   r   )r   r   r{   r  )r  rR  rx   rR  r{   r   )r{   ztuple[str, ...])r   r   r  r%   r  r  r{   r  )r   r  r{   r  )rw   zOptional[type[Any]]r  r   r{   r   )r{   zOptional[list[int]])r  r  r  r   r{   r  )r{   r  (6  
__future__r   r  r  r  enumr   r  rF  r~  r  r!  r  r   rg  rb  r  rm  rA  rc  r  r_  r  collections.abcr   r   r   r   r   r	   r
   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   rg   r@   torch._inductor.runtime.hintsr   torch.utils._ordered_setr   torch.utils._pytreer   r   r    r!   r"   r#   r$   torch._prims_commonr%   torch.fxr&   r*  r'   torch.fx.noder(   r'  r*   r  r,   r'  r.   r  r/   r0   r1   r2   r3   r4   output_coder6   r  r7   r8   r>   r<   	lru_cacherH   torch._dynamo.device_interfacerI   torch._dynamo.utilsrJ   torch.autogradrK   torch.autograd.profiler_utilrL   (torch.fx.passes.graph_transform_observerrM   torch.fx.passes.shape_proprN   torch.utils._sympy.functionsrO   rP   rQ   rR   rS   torch.utils._sympy.symbolrT   rU   torch.utils._sympy.value_rangesrV   rW   r  rX   runtime.runtime_utilsrY   r   _IS_WINDOWS	getLoggerr|   r   r[   rf  r   	VarRangesrQ  ru   	InputTypeGPU_KERNEL_BIN_EXTSr  rA  rY  r)  ra   rc   rl   Functionrn   r   r   r   r   r   r   r  r!  r+  r4  r<  rZ  r   rg  rp  rv  ry  r{  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r/  r5  r6  r  r<  r>  rR  rW  r  ru  r|  r  r  r  r  r  r  r  r  r  r  r  r  r"  r,  r1  r6  r>  rC  rM  rd  rn  rr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r+  r/  r1  r4  r  r?  rN  rR  rW  r\  r_  rd  rl  r  rp  Enumrr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r3  r  r  r  r  r  r  r  r,  r/  r6  r>  r2  rG  rM  rX  rb  re  ri  r  rm  compilerh  rk  rp  ru  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r  rp   s   00rG   <module>r     s   "        	     	  	  
     U U          : / - >>//C$>",5$TT,= #	CL
 T  D 0 % 2 K 0  8 D  = llg%g!T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  9<QQ#&Q25Q
Qh T ;@
+)!)*@))#AL+	+++	)#.G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"E8WQU^ E:C*!).!)O!) 	!)H82C82!82 82z 48*0 (E
E$5EE&$%	DU	>2-,,,^ !# I "	 
2 .24 +4 	4  4  	4  4 nA!!L!!H Q7 7*  , , ,
F FR
 
 @ @ @?' ? T 8 J JI "+<	 -2%)BF	0>< T@ @ TR R:+\H>

8
@F
	
" ""&"&<<
< < 	<
  < <  < 
<~CC C"      	 $#&25/)X###&#25#$#* ...@.. .$ IMFF)EFF*	B&&   TD D T6 66 T Q0(#K(*$)) *!

!
"-!
4A!
HK!
	!
H1	" -1!
*!)! 
!H(%#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!N$&$!$ $ 	$
 $ $ 
$NH>L'"#&252(+" &	:994A9	9$ $3!3B	:&/ '#)* $	  +?*D*D*FG$!QAG  "**Y'H	 T & 1 1 1
 68 2 7
8 1 
	 )

)
-" 01 -"` D)t  *.=@	499  T4 42Yo Hs   $c: