
    Vh                   0   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ erd dlmZ d dlmZ d dlZd dlZd dlZd dlm Z m!Z! d dl"m#Z#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z< ddl=m>Z> ddl4m?Z?m@Z@mAZAmBZB ddlCmDZDmEZE ddl5mFZFmGZGmHZHmIZImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZR ddlSmTZT ddlUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZd  ej                  ef      Zgej                  j                  efd      Zjej                  j                  efd      Zkeld   Zmej                   G d d              Zoej                   G d! d"eo             Zp G d# d      Zq G d$ d%      Zrd@d&Zs G d' d(      Zt	 	 	 	 	 	 	 	 dAd)Zuej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  d*Z} G d+ d,eq      Z~ G d- d.eq      Z G d/ d0eq      Z	 	 	 	 dBd1Z	 	 	 	 	 	 	 	 dCd3Z G d4 d5eq      Z G d6 d7e      Z G d8 d9eq      Z	 dD	 	 	 	 	 	 	 dEd:Zej                   G d; d<             Z ej                         Z G d= d2      Z G d> d?      Zy)F    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)Sequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)get_metric_tableis_metric_table_enabled)free_unbacked_symbols
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)ComputedBufferget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingBaseSchedulerNodec                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<    ej
                  e      Z	d
ed<   ddZ
ddZddZddZddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr2   
mpi_bufferc                B    | j                   }|J |j                         S N)rN   get_name)selfops     I/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/scheduler.pydefining_op_namez SchedulerBuffer.defining_op_nameX   s#    ~~{{}    c                @    t        | j                  j                        S rT   )hashrL   namerV   s    rX   __hash__zSchedulerBuffer.__hash__]   s    DIINN##rZ   c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r=   rU   	writelinetyperL   __name__layoutget_aliasespformatget_mutationslenrQ   indentgetrawvalue)rV   resultr]   users       rX   	debug_strzSchedulerBuffer.debug_str`   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S rT   rL   rU   r^   s    rX   rU   zSchedulerBuffer.get_namet       yy!!##rZ   c                   | j                   J | j                   j                         sy | j                   j                         sL| j                   j                         s2t	        | j                   j                         t        j                        r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  | j                            }|| j                   j"                  v r$| j                   j"                  |   j                   }n#| j                   j$                  |   j                   }t        j                  j                  j'                  || j                          y t        j                  j                  j                  | j                          y )Nargs)rL   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr   CommBufferLayoutrD   graphwrapper_codecodegen_allocationhasattrkernelrU   inplace_update_buffersrK   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rV   input_buffer_nameinput_buffers      rX   allocatezSchedulerBuffer.allocatew   sV   yy$$$yy((* II224yy++-$))335r7J7JKGG  33DII> AHHf%188#B#BB !" ? ? P DNN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rZ   c                   | j                   J t        | j                   j                  t        j                        st        | j                         ry| j                  D ]  }t        |j                   t              s y yNFT)rL   ry   rg   r   
NoneLayoutr@   rQ   
OutputNode)rV   uses     rX   can_freezSchedulerBuffer.can_free   sg    yy$$$dii&&6:SII;
 :: 	C#((J/	 rZ   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y rT   )idrL   mergelistvaluesrQ   )rV   rQ   rn   r   s       rX   	set_userszSchedulerBuffer.set_users   st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
rZ   c                R    | j                   J | j                   j                         S rT   )rL   rw   r^   s    rX   rh   zSchedulerBuffer.get_aliases   s%    yy$$$yy5577rZ   c                R    | j                   J | j                   j                         S rT   )rL   rx   r^   s    rX   rj   zSchedulerBuffer.get_mutations   %    yy$$$yy++--rZ   Nreturnstrr   intr   Noner   bool)rQ   rP   r   r   r   zSequence[str])rf   
__module____qualname____annotations__dataclassesfieldr   rQ   r2   rR   rY   r_   rp   rU   r   r   r   rh   rj    rZ   rX   rI   rI   N   su    
O,,-K--dCE>C.?k.?.?3/J+ 
$$($?B
+8.rZ   rI   c                      e Zd ZU dZded<   y)SchedulerDonatedBufferNrM   rN   )rf   r   r   rN   r   r   rZ   rX   r   r      s    /3K,3rZ   r   c                     e Zd ZU ded<   ded<   ded<   ded<   ded	<   d
ed<   d?dZd@dZdAdZdAdZdAdZdBdZ	dAdZ
dCdZ	 	 	 	 	 	 dDdZdEdZdFdZdGdZdHdZ	 	 	 	 	 	 dIdZdCdZdJdZdJdZdCdZdCdZ	 	 	 	 dKdZdAd ZdAd!ZedJd"       ZedJd#       ZedGd$       ZedGd%       ZdLd&ZdMd'Z dNd(Z!dOd)Z"dGd*Z#dGd+Z$dGd,Z%dGd-Z&dGd.Z'dGd/Z(dGd0Z)dPd1Z*dGd2Z+dCd3Z,	 dQ	 	 	 	 	 dRd4Z-edSd5       Z.edSd6       Z/edSd7       Z0	 	 	 	 	 	 dTd8Z1	 	 	 	 	 	 dUd9Z2edVd:       Z3dWd;Z4dXd<Z5e6	 	 	 	 dYd=       Z7y>)ZrG   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr3   mpi_nodec                "    || _         d | _        y )Nc                     g S rT   r   )ru   kwargss     rX   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>   s    B rZ   )rK   debug_device_str)rV   rK   s     rX   __init__zBaseSchedulerNode.__init__   s    $-& 	rZ   c                Z   || _         t        t                  | _        t        t                  | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _	        | j                  D ci c]  }|j                         | c}| _        y c c}w c c}w )NF)rK   rL   rN   )rL   r   r   	ancestors
last_usagewrittenget_outputsrI   rK   outputsrU   outputs_by_name)rV   rL   outputbufs       rX   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	#C*$
   **,/
  .. /
 ,0<<<
$'CLLNC<
/
<
s   B#B(c                T    t        |       j                   d| j                         dS )Nz(name=)re   rf   rU   r^   s    rX   __repr__zBaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAArZ   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsra   (rL   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rc   Ignoring error in debug_str()Texc_info)rU   r=   splicere   rf   getattrri   r   writesr   readsrl   r   rp   rd   debug_str_extra	Exceptionlogwarningrm   rstrip)rV   r]   r   outs       rX   rp   zBaseSchedulerNode.debug_str   s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r   r^   s    rX   r   z!BaseSchedulerNode.debug_str_extra       rZ   c                $    | j                  |       S rT   )r   r^   s    rX   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_device  s    $$T**rZ   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Ndatar   z, F)shorten	multiline)r   rL   ry   torch	_inductorr   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rV   
maybe_datadata_strs      rX   debug_str_shortz!BaseSchedulerNode.debug_str_short  s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""rZ   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   r^   s    rX   log_detailszBaseSchedulerNode.log_details  s,    6####		
rZ   c                     y rT   r   )rV   self_dep	other_deps      rX   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair  s     	rZ   c                X    | j                  | j                  j                  |             y rT   )set_read_writesr   renamerV   renamess     rX   update_mutated_namesz&BaseSchedulerNode.update_mutated_names   s!    T--44W=>rZ   c                X    | j                  | j                  j                  |             y rT   )r   r   	with_readrV   deps     rX   add_fake_depzBaseSchedulerNode.add_fake_dep#  s!    T--77<=rZ   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wrT   )rh   rj   ).0r   s     rX   	<genexpr>z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>'  s-      
9<COO4!2!2!44
s   ,.)anyr   r^   s    rX   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation&  s%     
@D@P@P@R
 
 	
rZ   c                h    || _         | j                   j                  | _        | j                          y rT   )r   r   r   
prune_deps)rV   rws     rX   r   z!BaseSchedulerNode.set_read_writes+  s(    "&"2"2"8"8rZ   c                b    | j                         }t        fd|D              }||z
  | _        y )Nc              3  B   K   | ]  }j                  ||        y wrT   )get)r  kmutation_real_names     rX   r  z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>4  s     !U1"4"8"8A">!Us   )used_or_aliased_buffer_namesr   r   )rV   future_used_buffersr  used_bufferss     ` rX   set_last_usagez BaseSchedulerNode.set_last_usage0  s0     88:!!U!UU&)<<rZ   c                F    | j                   D ]  }|j                           y rT   )r   r   )rV   r   s     rX   mark_runzBaseSchedulerNode.mark_run7  s    << 	CLLN	rZ   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wrT   r]   r  r   s     rX   r  z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr><  s      
 HH
   )r   	itertoolschainr   r   r   r^   s    rX   used_buffer_namesz#BaseSchedulerNode.used_buffer_names;  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rZ   c                2   t        t                  t        j                  | j                  j
                  | j                  j                        D cg c]  }|j                   }}t        |      dkD  r|j                         }j                  |       t        j                  j                  j                  |      rC|j                  fdt        j                  j                  |   j!                         D               t        |      dkD  rS c c}w )Nr   c              3  *   K   | ]
  }|vr|  y wrT   r   )r  alias
used_namess     rX   r  zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>L  s#       J.	 s   )r   r   r  r  r   r   r   r]   rk   popaddrD   r|   name_to_bufferr  extendrw   )rV   r   depsr  s      @rX   r  z.BaseSchedulerNode.used_or_aliased_buffer_namesA  s    _&
 !t'7'7'='=t?O?O?V?VW
 HH
 
 $i!m((*CNN3ww%%))#. !"!7!7"224	 	 $i!m 
s   Dc                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wrT   )r]   rK   available_buffer_namesr  r   rV   s     rX   r  z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>V  s/      -
xxt~~DDD -
s   .1r   r   r^   s   `rX   r  zBaseSchedulerNode.prune_depsU  s#    ", -
..-
 #
rZ   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                    t        | t              syj                  j                  | j                     j                         }|t        j                  j                  v S NF)	ry   r)   rK   r   r]   rY   rD   r|   removed_operations)r   op_namerV   s     rX   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune^  sF    c7+nn00:KKMGagg8888rZ   c              3  4   K   | ]  } |      s|  y wrT   r   r  r   r/  s     rX   r  z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>d  s      
\#5FC
   r   r&   r   r   )r   r   r   r   remove_reads)rV   	to_remover/  s   ` @rX   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps\  sN    	9  
++11
 
	 	T--::9EFrZ   c                F    t        | || j                  j                         y rT   )_prune_redundant_depsrK   r   )rV   name_to_fused_nodes     rX   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_depsi  s     	d$68R8RSrZ   c                R    | j                   J | j                   j                         S rT   )rL   get_operation_namer^   s    rX   rU   zBaseSchedulerNode.get_namen  r   rZ   c                "    | j                         S rT   rU   r^   s    rX   get_first_namez BaseSchedulerNode.get_first_namer  s    }}rZ   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wrT   r>  )r  rL   s     rX   r  z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>w  s     Gd$--/G   )r   	get_nodesr^   s    rX   get_operation_namesz%BaseSchedulerNode.get_operation_namesu  s    Gdnn6FGGGrZ   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrT   r>  r  r   s     rX   r  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>{  s     AS#,,.ArB  )r   r   r^   s    rX   get_buffer_namesz"BaseSchedulerNode.get_buffer_namesy  s    ADLLAAArZ   c                B    t        d | j                         D              S )Nc              3  Z   K   | ]#  }t        |t              xr t        |d        % yw)T)disallow_fp32_opsNry   SchedulerNoder!   r  ns     rX   r  zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s7      
  q-( G+AFG
s   )+allrC  r^   s    rX   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision}  s%     
 ^^%
 
 	
rZ   c                B    t        d | j                         D              S )Nc              3  V   K   | ]!  }t        |t              xr t        |       # y wrT   rL  rN  s     rX   r  z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s-      
 q-(K-H-KK
s   ')rP  r^   s    rX   r!   z-BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rZ   c                    | gS rT   r   r^   s    rX   rC  zBaseSchedulerNode.get_nodes  s	    vrZ   c                    | j                   S rT   )r   r^   s    rX   r   zBaseSchedulerNode.get_outputs  s    ||rZ   c                     | j                   |   S rT   )r   )rV   buf_names     rX   
get_outputzBaseSchedulerNode.get_output  s    ##H--rZ   c                R    | j                   J | j                   j                         S rT   )rL   
get_devicer^   s    rX   r[  zBaseSchedulerNode.get_device  s%    yy$$$yy##%%rZ   c                L    | j                         }|d uxr |j                  dk(  S Ncpu)r[  re   rV   devices     rX   is_cpuzBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rZ   c                X    | j                         }|d uxr t        |j                        S rT   )r[  r?   re   r_  s     rX   r?   zBaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rZ   c                     yr,  r   r^   s    rX   is_reductionzBaseSchedulerNode.is_reduction      rZ   c                     yr,  r   r^   s    rX   is_split_scanzBaseSchedulerNode.is_split_scan  re  rZ   c                     yr,  r   r^   s    rX   is_templatezBaseSchedulerNode.is_template  re  rZ   c                     yr,  r   r^   s    rX   	is_externzBaseSchedulerNode.is_extern  re  rZ   c                     yr,  r   r^   s    rX   
is_foreachzBaseSchedulerNode.is_foreach  re  rZ   c                     yr,  r   rV   read_deps     rX   can_inplacezBaseSchedulerNode.can_inplace  re  rZ   c                     yr,  r   r^   s    rX   has_side_effectsz"BaseSchedulerNode.has_side_effects  re  rZ   c                \
    ddl m} t         t              rt        j
                  rt        j                  j                   j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      sy j(                  t        j                  j*                  z   j,                  j.                  z  }d fd} j1                         D ]  }|j2                  }|J |j5                         rJ|j7                         s:|j9                         s*|j;                         t        j                  j<                  v ro j>                  j@                  D ]h  }|jB                   j,                  jD                  v r$ j,                  jD                  |jB                     }n/ j,                  jF                  jI                  |jB                        }|s|t        j                  jJ                  jM                  |       st        |jN                  tP              r|jR                  J |jR                  D cg c]   }|j2                  j;                         |vr|" }	}tU        |	      dk(  s|	d   jV                  s&|	d   j2                   u s9|j2                  Gt        |j2                  jY                         tZ        j\                  tZ        j^                  tZ        j`                  f      r|jN                  rft        |jN                  j2                  tZ        jb                  tZ        jd                  f      r(tU        |j2                  j7                               dkD  r ||j2                  |j2                        s+ ||      s5t        j                  jf                  ji                  |j;                         |j;                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jj                  jm                  |j;                                t        j                  jj                  jm                  |j;                                |j;                         t        j                  jn                  |j;                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNru   c                v   | j                   j                        }| j                         t               }| j                  D ]s  }|j
                  }t        |t              s | j                   j                  |      |ur>|fd|j                  j                         D        z  }t        |      dkD  ss y y)Nc              3  @   K   | ]  }|j                   k(  r|  y wrT   r  )r  orX  s     rX   r  z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>  s%      vv)    r   FT)rK   get_fused_noderU   r   rQ   rL   ry   rG   r   reads_and_writesrk   )buf_to_be_inplaced
fused_noder$  ro   	user_noderX  rV   s        @rX   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s    
 ,55DDTJJ)224H %/LD*00 ! II	!)->? '00??	J%&  &22CCE 
 t9q= #!& rZ   r   )r}  rI   r   r   )8codegen.wrapperru  ry   rM  r   inplace_buffersrD   r|   has_featurer[  r"   INPLACE_BUFFERSr   r   r   codegensimd
SIMDKernelr   r   r   r-  rK   completed_operationsr   rL   rv   rw   rx   rU   removed_buffersr   r   r]   r   r   r  r}   	can_reuserN   NopKernelSchedulerNoderQ   rk   rq  rz   r   r   r0   MutationLayoutSHOULDREMOVEFallbackKernelr/   ru   make_inplacerv  r!  r   )
rV   ru  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         rX   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update  s   
 	; t]+&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &) NNgg(()nn112 		@ ##% C	CxxH''',,.88:..0<<>QWW%<%<<((.. 899 E EE $ E Edii PI $ : : > >tyy II ,,66y$G&y'<'<>TU$??666 "+&66??,4II &N & N+q0*1-99*1-22d:%NN6 *%NN::< " " 4 4 " = =! &11 * ) 5 5 : :!#!2!2BNN C! !$INN$O$O$Q RUV V1)..#((K6yA
 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B &..0 77G q8C	0&s   %T)c                .   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	      d
   }|j                  d|j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       ! t        |      dk(  ry |j                  |       d| _        y )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rL   get_originsrW   appendtargetmetasplitreplacerk   
writelines)	rV   buffer	only_onceorigins	out_linesry  op_info_strr  stack_trace_last_lines	            rX   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info6  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $-	%0 y>Q 	)$rZ   c                (    | j                  dd      S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implr^   s    rX   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizesb  s    55t 6 
 	
rZ   c                (    | j                  dd      S )NTFr  r  r^   s    rX   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizesh  s    55u 6 
 	
rZ   c                (    | j                  dd      S )NFTr  r  r^   s    rX   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizesn  s    55 6 
 	
rZ   c                Z    t        | j                  ||      j                         d      S )Nr  r   )start)sumget_read_write_buffer_accessesr   )rV   r  r  s      rX   r  z3BaseSchedulerNode.get_read_write_buffers_sizes_implt  s3     //+N 0 fh	
 	
rZ   c                $    t         t              ri S t         t              rt         j                  t              ri S ddt         t
              r@ t         j                         d         t         j                         d         z        nt        d      t        j                  t              }|r9 j                  j                  D ]   }||j                     j                  |       " |r9 j                  j                   D ]   }||j                     j                  |       " |r&t#        d  j                  j                  D              n	t#               }|r&t#        d  j                  j                   D              n	t#               }d fdt         t$              rt#         fd|D              }||z
  }||z
  }i }||z  D ]  }	t'        fd	||	   D              |	t(        j*                  j,                  v rt(        j*                  j,                  |	   }
n;|	t(        j*                  j.                  v rt(        j*                  j.                  |	   }
n	 	 	 	 d fd
 |
      }|	|vr|||	<   ||	xx   |z  cc<    |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        c                X    t         j                  j                  j                  | d      S )Nr   fallback)rD   r|   sizevars	size_hint)ss    rX   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<rZ   r   r       eAc              3  4   K   | ]  }|j                     y wrT   r  r  s     rX   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     BCsxxBr  c              3  4   K   | ]  }|j                     y wrT   r  r  s     rX   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     CCsxxCr  c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wrT   rL   )r  ro   s     rX   r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>$))!>r  r   )rK   r   rQ   r   rk   )r   snodesrQ   buf_usesrV   s       rX   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599rZ   c              3  J   K   | ]  } |j                         r|  y wrT   r  )r  r   r  rV   s     rX   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wrT   r   )r  r   
node_numels     rX   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     $RCZ$Rs   c                   | syt        | t        j                        r| j                         S t        | j                  t
              rj                  j                  | j                            j                  }d}|D ]x  }t        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  } x y |S t        | j                  t        j                        r"t        fd| j!                         D              S  	t#        | j%                                     }t'        | j)                               t+        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wrT   )rD   r|   
get_buffer)r  mut_nameget_buf_bytess     rX   r  zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  s-      $ &agg&8&8&BCs   /2)ry   r   TorchBindObjectr  rg   r0   rK   r   rU   rQ   rL   rG   r/   r   r   r  rx   rC   r   r;   	get_dtypemin)
r   rQ   totro   	sched_buf	buf_elemsbuf_accessed_elemsr  rV   r  s
         rX   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sC    c2#5#56,,..

,=> !NN66s||~FLLEC % 	%)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%	% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rZ   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )ry   r  ExternKernelSchedulerNoderL   r/   rM  rC   
get_rangesr   collectionsr   r   r   r   r]   r  r   r   FusedSchedulerNoder  rD   r|   r"  graph_inputs)rV   r  r  buf_accessesr   r   r   r  buf_byte_accessesrX  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rX   r  z0BaseSchedulerNode.get_read_write_buffer_accesses~  sn   6 d23Id56:II{<
 I	= dM*&doo/23 1! 456J
 SJ"..t4''-- 3SXX&--c23 ''.. 3SXX&--c23
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d./( )%) O o-FO+E,. 1	9H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I00.7!(+!(+y8+c1	9f ! rZ   c                j   | j                         d   j                         d   }|j                  j                         }t	        t        |            syt        | j                        r<t        | j                  t        j                        sJ 	 t        | j                        S t        | j                        ry|j                  j!                         }	 t#               }t%        |      dz  }t        | t(              rt        | j                  t        j*                        sJ dt-        | j                               t.        j1                  t3        | j                  dd      d      }|Xddlm} ddlm}	 t=        d	 | j                  j>                  D              ry |       5 }
 |	d
      5 }tA        jB                  | j                  jD                        5  tA        jF                  |
      5  ddlm$} | j                  j>                  D cg c]  } ||d
       }}| j                  jJ                  } |jL                  |g|i | j                  jN                   d}|jQ                         }| jS                         }||z  |z  dz  }||z  }tU        ||      cddd       cddd       cddd       cddd       S yt        | tV              st        | j                  tX              r| jS                         |z  S y# t        $ r}t        j                  |       Y d}~yd}~wt        $ r}t        j                  |       Y d}~yd}~ww xY w# t&        $ r Y yw xY wc c}w # 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)zB
        Returns estimated op runtime in nanoseconds (ns)
        r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec              3  f   K   | ])  }t        t        |j                                     d kD   + ywr   N)rk   r   	get_numelrN  s     rX   r  z:BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>6  s.       -akkm<=As   /1F)displayr   )ir_node_to_tensor)guard_shapeg      ?r  )-rC  r   rL   rz   r?   r-   r>   ry   r   IRNoder%   
ValueErrorr   r   	TypeErrorrB   maybe_get_dtyper<   r:   r   r  ExternKernelre   kernel_name_to_opr  r   torch._subclasses.fake_tensorr  torch.utils.flop_counterr  r  inputsrD   set_current_nodefx_nodeset_fake_moder  	__class__process_kernelr   get_total_flopsr  maxr  r,   )rV   r   rg   edtypegpu_memory_bandwidth	gpu_flopsrW   r  r  	fake_modeflop_counter_moder  inputfake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_times                        rX   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime  s>   
 nnq!--/2))+of-. #dii333
7		BB TYY
 ((*	#4#6 )%069I d56dii9P>Nd499o=O;PP9"&&		#7<dB
 ~HD !YY--   #$<(1#E2<6G&&tyy'8'89< OOI.	< 6 &*YY%5%5#! *%UC#K # ))--C&C&&rLKL499;K;KL !F$5$E$E$GM$($E$E$GM$*]$:Y$F##ML$14H$HM |];/< < < < <>  01ZII~6
 4469MMMU       		<#< < < < < < < < <> ?<> s   L M (
N)2*NM?2M*	M%"A?M*	!	M?*	N3	N)	ML((M4MM	M"!M"%M*	*M3/M?6	N?NN	N)N	N))N2c                     y rT   r   r^   s    rX   get_template_nodez#BaseSchedulerNode.get_template_node_      rZ   c                .    | j                         }|J |S rT   r  )rV   templates     rX   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throwb  s!    ))+###rZ   c                f    t        d t        |       D              }| d| }| |   }| |dz   d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]  \  }}|j                         s|  y wrT   ri  )r  irO  s      rX   r  zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>n  s     PDAqaPs   ""Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        rX   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogueg  sN     PIe,<PP.)n-!+-.00rZ   N)rK   rJ   r   r   )rL   ir.Operationr   r   r   )r   z	list[str]r   r   r'   r   r'   r   r   r   dict[str, str]r   r   )r   r&   r   r   r   )r  r   r   r   r  OrderedSet[str]r  r'  r   r   r   r)  r9  dict[str, BaseSchedulerNode]r   r   r   r  )r   zSequence[SchedulerBuffer])rX  r   r   rI   r   Optional[torch.device]rp  zdependencies.Depr   r   T)r  r=   r  r   r   r   r   )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int])r   floatr   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])8rf   r   r   r   r   r   r   rp   r   r   r   r   r   r   r   r  r   r  r  r  r  r  r6  r:  rU   r?  r7   rD  rH  rR  r!   rC  r   rY  r[  ra  r?   rd  rg  ri  rk  rm  rq  rs  r  r  r  r  r  r  r  r  r  r  staticmethodr#  r   rZ   rX   rG   rG      sX   BB(('' NN''

&B*2+#
!.7	
?>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:~B 9=*$*15*	*X 
 

 
 

 
 


!
37
	
C!!C!37C!	C!J Y Yv
 1&1	S1 1rZ   c                  B    e Zd ZU g dZded<   ded<   d
dZddZddZy	)	WhyNoFuse)node1node2reasonru   r   r:  ztuple[Any, ...]ru   c                     || _         || _        y rT   )r8  r9  rV   r8  r9  s      rX   r   zWhyNoFuse.__init__}  s    

rZ   c                J    || _         || _        t        j                  |        y rT   )r:  ru   
fusion_logdebug)rV   r:  ru   s      rX   __call__zWhyNoFuse.__call__  s    	rZ   c                    d| j                   j                          d| j                  j                          d| j                  | j                  z  z   S )Nzcannot fuse z with ra   )r8  rU   r9  r:  ru   r^   s    rX   __str__zWhyNoFuse.__str__  sK    djj1134F4::;N;N;P:QQSTKK$))#
 	
rZ   Nr8  rG   r9  rG   r   r   )r:  r   ru   r   r   r   r   )rf   r   r   	__slots__r   r   r@  rB  r   rZ   rX   r7  r7  v  s#     5IK


rZ   r7  c                    t        | t        t        f      rt        | t              } t        j                  | d      }d|v rdt        j                  |d       S |S )Nkey   )rl   r       )	ry   r   setsortedr   pprintri   textwraprl   )objrn   s     rX   ri   ri     sR    #
C()Sc"^^C*Fv~HOOFG4566MrZ   c                  0    e Zd ZddZddZddZd	dZeZy)
r   c                &    t        |g      | _        y rT   r)  r   s     rX   r   zOutputNode.__init__  s    ",cU"3rZ   c                     yr,  r   r^   s    rX   rd  zOutputNode.is_reduction  re  rZ   c                     y)Nr   r   r^   s    rX   rw   z'OutputNode.get_inputs_that_alias_output  r   rZ   c                     y)NOUTPUTr   r^   s    rX   rU   zOutputNode.get_name  s    rZ   N)r   r(   r   r   r   r   r   )rf   r   r   r   rd  rw   rU   r   r   rZ   rX   r   r     s    4 HrZ   r   c                    t        j                          j                  D ]N  }t        |t              r|j
                     j                         }|   j                         xx   dz  cc<   P d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   c                    t        | t              rD| j                     j                         }|   j	                            dkD  }|   k(  }|xs |S y)Nr   F)ry   r)   r]   rY   rU   )r   r.  is_redundantis_self_depr   name_to_dep_countr9  rL   s       rX   r/  z+_prune_redundant_deps.<locals>.should_prune  sb    c7#!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.rZ   c              3  4   K   | ]  } |      s|  y wrT   r   r1  s     rX   r  z(_prune_redundant_deps.<locals>.<genexpr>  s      ,s2Cr2  Nr3  )r  r   r   ry   r)   r]   rY   rU   r   r   r   r4  )rL   r9  r   r   r.  deps_to_prunerY  r/  s   ```   @@rX   r8  r8    s     '2&9&9&;&& K#w'!#((+<<>G09BBDEJEK

 
  .. M "&"9"9M"IT--::=IJ rZ   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmzextern_kernels._scaled_mmc                  8     e Zd Zd fdZddZddZddZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rT   superr   r   r   get_read_writesrV   rK   rL   r  s      rX   r   z"ExternKernelSchedulerNode.__init__  5    #T"T1134rZ   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = r  )rU   r   rL   r^   s    rX   r   z)ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbrZ   c                     yNTr   r^   s    rX   rk  z#ExternKernelSchedulerNode.is_extern  r  rZ   c                    | j                   J t        | j                   d      xr | j                   j                         S )Nrs  )rL   r   rs  r^   s    rX   rs  z*ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVrZ   rK   rJ   rL   r$  r   r   r   r   )rf   r   r   r   r   rk  rs  __classcell__r  s   @rX   r  r    s    5
cWrZ   r  c                        e Zd Zd fdZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rT   r^  ra  s      rX   r   zNopKernelSchedulerNode.__init__  rb  rZ   rg  )rf   r   r   r   rh  ri  s   @rX   r  r    s    5 5rZ   r  c                  J    e Zd ZU ded<   ded<   	 	 	 	 	 	 d fdZ	 	 d	 	 	 	 	 ddZ	 	 d	 	 	 	 	 ddZ	 	 	 	 	 	 ddZdd	Zdd
Z		 	 	 	 	 	 d dZ
d!dZd"dZd#dZd#dZd#dZd$dZd%dZ	 	 	 	 d&dZd'dZ	 d(	 	 	 d)dZed*d       Zed*d       Zd+dZed,d       Z xZS )-rM  z tuple[Sequence[sympy.Expr], ...]_sizesr1   _bodyc                f    t         |   |       | j                  |       | j                          y rT   )r_  r   r   _compute_attrsra  s      rX   r   zSchedulerNode.__init__  s,    
 	#T"rZ   c                   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        | _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        t        j                   xs t        |j                          }t        | j                  t        j                        r,| j#                  | j                  j%                  |             y | j#                  t'        j$                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizeru  )ry   rL   r   r,   TemplateBuffersimplify_and_reorderrm  rn  get_device_or_errorrK   get_backendgroup_fnr   r   loop_ordering_after_fusionr?   re   r   extract_read_writesr   )rV   rs  rt  r`  rz  should_normalizes         rX   rp  zSchedulerNode._compute_attrs  s3   
 $))b&7&79J9J%KLLL"&))"@"@'A&? #A #
TZ
 ..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!23  		--8H-I   00JJ!%8HrZ   c                *    | j                  ||       y )Nrr  )rp  )rV   rs  rt  s      rX   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rZ   c                b   t        d | j                  j                  D              }| j                  t	        j
                  | j                  g| j                  d|ij                  |             | j                  j                  |        |r!ddlm} |j                  j                          y y )Nc              3  N   K   | ]  }t        |t        t        f      s|  y wrT   )ry   r)   r(   r  s     rX   r  z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>-  s#      0
ZgwEW5XC0
s   %%ru  r   SIMDScheduling)r   r   r   r   r   r|  rn  rm  r   pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rV   ru  need_clear_tiling_cache	fake_depsr  s        rX   refresh_dependenciesz"SchedulerNode.refresh_dependencies(  s    
 &0 0
++110
 &
	 	,,

![[4=i	"	
 	""..t4"4 ,,88: #rZ   c                    | j                   j                  |      | _         | j                   j                  | _        | j	                  dd       y )NFTru  r  )rn  reorder_iter_loopssizesrm  r  )rV   	new_orders     rX   apply_new_loop_orderz"SchedulerNode.apply_new_loop_orderD  sA    ZZ22

 jj&&!!E4!PrZ   c                    | j                   j                         | _         | j                   j                  | _        | j	                  dd       y )NTFr  )rn  merge_loopsr  rm  r  r^   s    rX   r  zSchedulerNode.merge_loopsL  s<    ZZ++-
jj&& 	!!D%!PrZ   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       y t        j                  d| j                                y )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
rm  rk   num_varsdecide_loop_order_to_matchr    num_loop_reorderingloop_ordering_logr?  rU   r  )rV   r   r   r  
self_sizess        rX   r   z'SchedulerNode.reorder_loops_by_dep_pairX  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##WrZ   c                $   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]  }t        |t              r|j                  }t        j                  j                  |      }t        |t        j                        rZ|j                  | dt        |j                                 t        | j                   t"              rR|j                  d| d       |j                  t%        j&                  | j                   j)                         d	             | j*                  J |j-                  | j/                                d
j1                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:rI  r   )rU   r   rm  r   r|  ry   r)   r]   rD   r|   r  r   r  r  ri   rg   rn  r1   rM  rl   rp   rL   r#  r   join)rV   r]   linesr   rX  r   s         rX   r   zSchedulerNode.debug_str_extral  sK   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	OCc7+88gg((2!#r'9'9:LLH:Z

8K7L!MN	O djj(+LL6${34LL)=)=)?HIyy$$$T//12yyrZ   c                    | j                   S rT   )rm  r^   s    rX   r  zSchedulerNode.get_ranges      {{rZ   c                    t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               S Nr  )ry   rL   r   r,   rv  re   r   r   r^   s    rX   rd  zSchedulerNode.is_reduction  s[    $))b&7&79J9J%KL 	
tDII !	
L DII00233rZ   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S r  )ry   rL   r   r,   rv  re   r   	SplitScanr^   s    rX   rg  zSchedulerNode.is_split_scan  sy    $))b&7&79J9J%KL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rZ   c                J    t        | j                  t        j                        S rT   ry   rL   r   rv  r^   s    rX   ri  zSchedulerNode.is_template  s    $))R%6%677rZ   c                f    t        | j                  t        j                        r| j                  S d S rT   r  r^   s    rX   r  zSchedulerNode.get_template_node  s$    &tyy"2C2CDtyyN$NrZ   c                f    | j                          | j                          | j                  |       y rT   )r  r  r  )rV   
index_varss     rX   runzSchedulerNode.run  s#    ""$Z rZ   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S rT   )	rm  r  maprk   dictzipr  r  from_iterable)rV   r  r  
var_rangess       rX   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rZ   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w# t        $ r" t        j                  d| j                          w xY w)NzError in codegen for %s)r  rD   set_ops_handlerr6   get_ops_handlerr   r  rn  r   r   fatalrL   )rV   r  r  s      rX   r  zSchedulerNode.codegen  s    00<
	!!"213D3D3F
"ST())$/( 

J'	( ( ( ( ( (
  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    |r| j                   nt        | j                         \  }}t        j                  | j                  |t
        j                  j                  gt        |      z  g      S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rm  reversedr   r|  rn  sympySZerork   )rV   	pointwise
keep_sizesignore_sizess       rX   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  sT     3<4;;$++AV 
L//JJ
%'',,#lBS1S0T
 	
rZ   c                &    | j                  d      S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  r^   s    rX   r  z#SchedulerNode.pointwise_read_writes  s    
 666FFrZ   c                &    | j                  d      S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  r^   s    rX   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGrZ   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wrT   )rh   rG  s     rX   r  z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?rB  r   ztype(write_dep)=)ri  r  r   rk   r   r   ry   r   r'   r  iterre   indexsize)rV   rp  	write_deps      rX   rq  zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXrZ   c                   t        t                  }t        | j                  t              r| j                  j                         D ]  }|j                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   rH  r]      r   r   )r   r   ry   rn  r1   rC  rW   r  r   rk   ru   r!  )rV   buffers_store_as_atomic_addrL   s      rX   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    &0o&7#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*rZ   )rK   rJ   rL   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)rs  z*Optional[tuple[dict[Any, Any], list[Any]]]rt  zOptional[Callable[..., Any]]r   r   )ru  r   r  r   r   r   )r  zSequence[int]r   r   r   r%  r   )r   Sequence[Sequence[sympy.Expr]]r   r3  )r  Sequence[sympy.Expr]r   r   )r  r  r   zdict[sympy.Expr, sympy.Expr])r  r  r   r   r1  )r  r   r   r   )r   r   r0  r*  )rf   r   r   r   r   rp  r  r  r  r  r   r   r  rd  rg  ri  r  r  r  r  r  r7   r  r  rq  r  rh  ri  s   @rX   rM  rM    sk   ,,O : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	
;;8<;	;8Q
Q!.7	( ,4
8O!
8	%
 !%	
	
	 	
 G G H H + +rZ   rM  c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrT   r]   rH  )r  r   group_snodes     rX   r  z2refresh_group_node_dependencies.<locals>.<genexpr>  s.      
xx{;;== 
   (+)
r  r   r   
ReadWrites
merge_listr   r   unionr   r   )r  r  r  s   `  rX   refresh_group_node_dependenciesr    s     F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2rJ   c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wrT   r   r  r  s     rX   r  z"init_group_node.<locals>.<genexpr>       HHr  c              3  4   K   | ]  }|j                     y wrT   )r   r  s     rX   r  z"init_group_node.<locals>.<genexpr>  r  r  )ry   r  GroupedSchedulerNoder  rK   rL   r   r  r   r  r  r   r  r   r   rU   r   )r  rK   r  r  r   s        rX   init_group_noder    s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                  t    e Zd ZU dZded<   e	 	 	 	 	 	 dd       Z	 	 	 	 	 	 ddZd fdZe	dd       Z
ddZe	d d	       Zd!d
ZddZddZ	 	 	 	 	 	 d" fdZe	d d       Ze	d d       Zd#dZddZe	d$d       Ze	d$d       Ze	d$d       Ze	d%d       Zd&dZe	d$d       Zd'dZd(dZd)dZddZ xZS )*r  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r4  r  c           	        |j                   |j                   u sJ t        |t        t        f      sJ |j	                         rt        |t
              rt        |j                  t              sJ t        |j                  j                        dk(  sJ t        t        t        |j                  j                              t              sJ t        t        |j                  j                              j                  }|j                         D cg c]  }|j	                         s| }}t        |      dk(  sJ |d   }t        |j                  j                        dk(  sJ t        t        |j                  j                              }t        |t               sJ t#        t!        ||j$                  |j&                  |j(                  |j*                        g      |j                  _
        nt        |t        t        f      sJ t-        t/        j0                  |j                         |j                                     } | |j                   |      S c c}w )Nr   r   )rK   ry   rM  r  ri  r  rL   r/   rk   r   r   r  r  r(   r]   rC  r'   r   r  	var_namesr  r  r   r  r  )	r
  r8  r9  r]   rL   template_nodesr!  writer  s	            rX   fusezFusedSchedulerNode.fuse#  s    %//111%-1C!DEEE:e5N#O ejj+666u((//0A555d4(9(9(@(@#ABGLLLU..5567<<D/4/@WtDDTDTDVdWNW~&!+++*1-M}00778A===m77>>?@EeY///'1ekk5??EJJ

(E$ em5G%HIIIY__U__%68IJK5??E**! Xs   I'Ic                   | j                         ry d }| j                  D ]`  }t        |t              sJ |;t	        |      t	        |j
                  d         k7  rt        j                  d        y |j
                  d   }b d }|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t        j                  d| j                                y t        xj                  dz  c_        t        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)ri  r  ry   rM  tuplerm  r  r?  rk   r  r  rU   r    r  r  r  )rV   r   r   r  snoder  s         rX   r   z,FusedSchedulerNode.reorder_loops_by_dep_pairE  sI    
[[ 	)Ee]333%%
*;uU\\RS_?U*U!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-rZ   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S rT   )r   rd  r  s    rX   r   z-FusedSchedulerNode.__init__.<locals>.<lambda>n  s    s1>>3C/D rZ   rF  )r_  r   r  rQ   r  r   rV   rK   r  r  s      rX   r   zFusedSchedulerNode.__init__j  s8    #i0%'
%DEKK
rZ   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_r  r  rU   rV   r  s     rX   rU   zFusedSchedulerNode.get_namep  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S Nr   r  rU   r^   s    rX   r?  z!FusedSchedulerNode.get_first_namet      {{1~&&((rZ   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rT   r   r  r  rH  r  s     rX   rH  z#FusedSchedulerNode.get_buffer_namesw  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S rT   r  r#  r   rV   rn   rL   s      rX   r   zFusedSchedulerNode.get_outputs{  4    (*KK 	.DMM$**,-	.rZ   c           
     ~   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }||j                  | j                                t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   r   rI  )r  r  rU   rp   rL   r#  r   rM  rl   r  r   )rV   r  rL   r  s       rX   r   z"FusedSchedulerNode.debug_str_extra  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""LL3356tyy/668&AA
s   0B9c                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r  r   )rV   rL   
snodes_strs      rX   r   z"FusedSchedulerNode.debug_str_short  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t        t                  }t	        | j
                        D ]/  }|j                  ||       |j                  |j                         1 y rT   )r_  r  r   r   r  r  updater   )rV   r  r  rL   r  s       rX   r  z!FusedSchedulerNode.set_last_usage  sa    
 	24FG )o/T[[) 	8D 35GH&&t7	8rZ   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rT   )r   r  r  r  r  s     rX   r  z$FusedSchedulerNode.used_buffer_names  s.    !MA!"5"5"7!MNN!Mr  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rT   )r   r  r  r  r  s     rX   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s3    8<D1a,,.D
 	
Dr  c                    | j                   S rT   r  r^   s    rX   rC  zFusedSchedulerNode.get_nodes  r  rZ   c                T    t        |       j                   d| j                          dS )Nz(nodes=r   r   r^   s    rX   r   zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rZ   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrT   )rd  r  s     rX   r  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9rB  r  r  r^   s    rX   rd  zFusedSchedulerNode.is_reduction  s    9T[[999rZ   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrT   )rg  r  s     rX   r  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :1??$:rB  r  r^   s    rX   rg  z FusedSchedulerNode.is_split_scan  s    :dkk:::rZ   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrT   r  r  s     rX   r  z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8rB  r  r^   s    rX   ri  zFusedSchedulerNode.is_template  s    8DKK888rZ   c                j    | j                   D ]$  }|j                         s|j                         c S  y rT   )r  ri  r  rV   rL   s     rX   r  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 rZ   c                     | j                   d   S r  )r   r^   s    rX   r[  zFusedSchedulerNode.get_device  s    zz!}rZ   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrT   )r  r  s     rX   r  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/ErB  r  r^   s    rX   r  z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErZ   c                    t         rT   NotImplementedErrorr   s     rX   r   z'FusedSchedulerNode.update_mutated_names      !!rZ   c                    t         rT   r  )rV   r]   s     rX   r   zFusedSchedulerNode.add_fake_dep  r  rZ   c                    t         rT   r  ro  s     rX   rq  zFusedSchedulerNode.can_inplace  r  rZ   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)r   rb   c              3  F   K   | ]  }t        |      j                    y wrT   )re   rf   rN  s     rX   r  z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     FQQ 0 0Fs   !ra   r   r   r   r   r   r   z.outputs = [
            Nrc   r   Tr   )rU   r  r  r=   r   re   rf   ri   r   r   r   r   rl   r   rp   rd   r   r   r   r   rm   r   )rV   r]   node_typestrr   r   s        rX   rp   zFusedSchedulerNode.debug_str  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%r8  rG   r9  rG   r   r  r%  rK   rJ   r  r4  r   r   r   r*  r   zlist[SchedulerBuffer]r(  r-  r   r3  )r   torch.devicer&  )r]   r&   r   r   r0  ) rf   r   r   __doc__r   classmethodr  r   r   r7   rU   r?  rH  r   r   r   r  r  r  rC  r   rd  rg  ri  r  r[  r  r   r   rq  rp   rh  ri  s   @rX   r  r    so    $#+%+.?+	+ +B#.!#..7#.	#.JL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""*rZ   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y rT   )r   rU   read_to_node)rV   producerr   s      rX   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  sL     '') 	9C||~!2!22((88	9 rZ   c                   t        t                  }|j                  j                  D ]  }|j                  | j
                  j                  vr&| j
                  j                  |j                     j                         }|| j                  v sf|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr   )r   rG   r   r   r]   rK   r   rY   name_to_noder!  rk   r  r  )rV   consumer	producersrd	node_names        rX   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,, 	<Bwwdnn88822277;LLNID---d//	:;	< y>QY((rZ   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wrT   )rK   can_fuse)r  lrr.  s      rX   r  z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r7  rm  typingcastr+  rk   r  rQ  r  rd  r/  rK   r:  r7  AssertionError)r
  r.  r3  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rX   r:  z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
rZ   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rm  r=  r>  r+  rE  rH  r  r  r  r  r7  r  r/  r?  rK   )r
  r.  r3  rE  rH  rF  rG  r;  r<  fused_nodesrC  rL   new_noderB  s                 rX   r  zForeachKernelSchedulerNode.fuse>  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                @    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<    | _        |d   j=                         }|sJ |t?        j@                  d      fff _!        t!        tD        jF                  jH                             _%        | _&        y )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrT   r  r(  s     rX   r  z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s0       xxt'<'<'>>	 r  r   combo_kernel)'r-  r2  r_  r   r   r   r]   rD  rK   r  rL   rQ   r   r   r  r  r   r  r   r   r  r   r  r   rm  ry   r+  r   r
  rE  r[  r  Exprr   r   fxNoder  rH  )rV   rK   r  rE  rF  rG  rH  rL   r  r]   foreach_node
other_noder`  r  s   `            rX   r   z#ForeachKernelSchedulerNode.__init__  sq    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 *C&%%'v

> :<>?
!%((--02.rZ   c           	         |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t        t        f      s| }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }}|r)t        j                  dt        t	        |      g             |D cg c]	  }||vs| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz,ComboKernels: %d template nodes are filtered)ry   r  r   r?  rk   rL   r  r  r+  ri  r   )r
  r  r  externrL   filtered_nodesforeach_nodesr  s           rX   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  sy    #Oj4M&N!OOIIAF5;UTtyy?T&&(U 
a"8:S!TU 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGII>C/01 &4Oq7N!OO7 P
 V




 H PsL   EEE"E":E'E,5E, E16E1 E6E6	E;E;c           
         | j                         }g }d}|D ];  }|j                  t        dt        |      |      D cg c]
  }||||z     c}       = |S c c}w )zS
        Returns a list of lists of nodes that are to be grouped together.
           r   )_topological_sort_nodesr#  rangerk   )rK   sorted_nodesgrouped_nodesmax_num_nodesr  r  s         rX   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  sw     !88:! 	E   #1c%j-@ !a-/0	 s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y rT   r+  ra  )custom_group_algorithms    rX   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#DrZ   c                ,    t         j                  |       S rT   rc  rK   s    rX   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVrZ   c                    t         rT   r  r^   s    rX   r  z#ForeachKernelSchedulerNode.mark_run
  r  rZ   c                    t         rT   r  r^   s    rX   r  z"ForeachKernelSchedulerNode.codegen  r  rZ   c                     yre  r   r^   s    rX   rm  z%ForeachKernelSchedulerNode.is_foreach  r  rZ   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  r^   s    rX   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DKK  rZ   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wrT   )rC  r  s     rX   r  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s     1UA!++-1UrB  )r   r  r  r  r  r^   s    rX   rC  z$ForeachKernelSchedulerNode.get_nodes  s(     IOO111U1UUVVrZ   c                <    | j                   d   j                         S r  )r  r?  r^   s    rX   r?  z)ForeachKernelSchedulerNode.get_first_name  s    {{1~,,..rZ   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y rT   )r8  rK   r   r  r:  )rV   r9  rL   s      rX   r:  z/ForeachKernelSchedulerNode.prune_redundant_deps   s=     	d$68R8RSKK 	:D%%&89	:rZ   )r.  rG   r   rM   )r3  rG   r   rM   r.  rG   r3  rG   r   r   )r.  rG   r3  rG   r   r+  )NNF)rK   rJ   r  r4  rE  r   rF  rM   rG  rM   rH  r   r   r   r  r4  r   r4  )rK   rJ   r   list[list[BaseSchedulerNode]])rd  r`  r   r   r   r   r   r4  r-  r   r+  )rf   r   r   r(  r/  r7  r)  r:  r  r   rW  r5  r_  ra  r   re  rh  r  r  rm  rm  rC  r?  r:  rh  ri  s   @rX   r+  r+    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %B/B/ (B/ $(	B/
 1B/ 1B/ B/ 
B/H +	  @ 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	:rZ   r+  c                       e Zd ZU dZded<   edd       Zd fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZddZedd       Z xZS )r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r4  r  c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wrT   rg  )r  rL   rK   s     rX   r  z.GroupedSchedulerNode.create.<locals>.<genexpr>8  s     B44>>Y.B   )rK   rQ  r9  rU   )r
  r  grouped_snoder  rK   s       @rX   createzGroupedSchedulerNode.create5  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>rZ   c                >    t         |   |       t        | ||       y rT   )r_  r   r  r  s      rX   r   zGroupedSchedulerNode.__init__?  s    #i0rZ   c                   | j                   D ])  }|| j                  j                  |j                         <   + | j                  j                  | j                         = | j                  j	                  | j                         S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  rK   r9  rU   
fuse_nodes)rV   r  s     rX   unpackzGroupedSchedulerNode.unpackC  se    
 [[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55rZ   c                    | j                  | j                  j                  |             | j                  j	                  |       y rT   )r   r   r   r   r!  )rV   fake_deps     rX   r   z!GroupedSchedulerNode.add_fake_depM  s5    T--77AB##H-rZ   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w r  r  r  s     rX   rU   zGroupedSchedulerNode.get_nameQ  r  r  c                <    | j                   d   j                         S r  r  r^   s    rX   r?  z#GroupedSchedulerNode.get_first_nameU  r  rZ   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rT   r  r  s     rX   rH  z%GroupedSchedulerNode.get_buffer_namesX  r   r  c                j    g }| j                   D ]!  }|j                  |j                                # |S rT   r  r  s      rX   r   z GroupedSchedulerNode.get_outputs\  r  rZ   c                    | j                   S rT   r  r^   s    rX   rC  zGroupedSchedulerNode.get_nodesb  r  rZ   c                     yr,  r   )r
  r.  r3  s      rX   r:  zGroupedSchedulerNode.can_fusee  s     rZ   )r  r4  r   r  r%  ru  )r  r&   r   r   r   r*  r&  r-  rr  )rf   r   r   r(  r   r)  r{  r   r  r   r7   rU   r?  rH  r   rC  r:  rh  ri  s   @rX   r  r  )  s~     $# 16. = =) N N  rZ   r  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r  sl_asl_bs      rX   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r  s      rX   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  r  r  r  )r8   absr  r  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rX   	index_cmpz"pick_loop_order.<locals>.index_cmpu  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   rF  )r  r   r  r   r   r   )		functools
cmp_to_keyr   r  r[  rk   r   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    rX   pick_loop_orderr  k  s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]rL   Fr   rq  is_weakc                v    t        | j                  j                         | j                  | j                  f      S rT   )r\   rL   rU   rq  r  r^   s    rX   r_   zNodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrZ   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rT   )ry   r  rU   rq  r  rV   others     rX   __eq__zNodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rZ   c                6    | j                   j                         S rT   rr   r^   s    rX   rU   zNodeUser.get_name  rs   rZ   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S rT   )rL   r  rq  r  r  s     rX   r   zNodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rZ   Nr   )r  objectr   r   r   )r  r  r   r  )
rf   r   r   r   rq  r  r_   r  rU   r   r   rZ   rX   r  r    s3    
..K GTL
$
rZ   r  c                  N    e Zd ZU ded<   dCdZdC fdZdDdZedEd       Zej                  dFd       ZdGdZ
dHd	ZdId
ZdGdZdGdZdGdZ	 	 	 	 dJdZdKdZdLdZdGdZdGdZdJdZdGdZ	 	 	 	 dMdZ	 	 	 	 	 	 dNdZ	 	 	 	 	 	 dOdZdGdZdPdZ	 	 	 	 	 	 dQdZdRdZ	 	 	 	 dJdZdSdTdZdUdZ 	 	 	 	 dVdZ!	 	 	 	 	 	 dWd Z"	 	 	 	 	 	 dWd!Z#	 	 	 	 	 	 dWd"Z$	 	 	 	 	 	 	 	 dXd#Z%	 	 	 	 	 	 dYd$Z&dZd%Z'	 	 	 	 	 	 	 	 d[d&Z(dWd'Z)	 	 	 	 	 	 dWd(Z*	 	 	 	 	 	 	 	 d\d)Z+d]d*Z,d^d+Z-	 	 	 	 	 	 dYd,Z.	 	 	 	 d_d-Z/	 	 	 	 d`d.Z0dGd/Z1dGd0Z2dGd1Z3dad2Z4dbd3Z5dcd4Z6ddd5Z7	 	 	 	 	 	 ded6Z8dZd7Z9	 	 dfd8Z:	 	 	 	 	 	 dgd9Z;	 	 dhd:Z<dGd;Z=	 	 	 	 	 	 did<Z>dGd=Z?dUd>Z@	 	 	 	 djd?ZAdkd@ZBdldAZCdGdBZD xZES )mrJ   zdict[Dep, int]_Scheduler__dep_size_hint_cachec                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initrV   r  s     rX   r   zScheduler.__init__  s,    ./ 	JJu	 	 	s   '0c           
         t                    i  _         t        j                  _        i  _        t        t               _	        t        j                          _        t        t                   _        t        g t        j                  j                   j#                         t        j                  j$                  j#                         t        j                  j&                  j#                                _        |D cg c]  } j+                  |       c} _         j/                           j(                  j1                  t        j                  j$                  j#                                 j,                  D ]  }|j3                            j5                          _         j,                  D ci c]  }|j9                         | c} _         j,                  D ci c](  }|j=                         D ]  }|j9                         | * c}} _         j:                  jA                          _!        i  _"        i  _#        tI        jJ                   j,                   j>                   jB                         _         jM                           jO                   j,                         _         jQ                           j,                  D ci c]  }|j9                         | c} _!         jS                          tT        xjV                  tY         j,                        z  c_+        ddl-m.}m/}  | j,                         tY         j,                         _0         jc                           jO                   j,                         _        t        td        t        t        f              _3        th        jj                  $ti        jj                   j,                         _         jm                   j,                         _         jo                           jq                          th        jr                  r ju                  d        th        jv                  rddl<m;}  | j,                   j>                   jB                  t        t        j                  j                   j#                               t        t        j                  j{                                      _        th        j|                  r$tI        j~                   j,                         _         j                           j                           | j,                         t        j                  j                   j,                          j                          t        t                   _E        i  _F        t        d      j                   fd       y c c}w c c}w c c}}w c c}w )Nr   )log_ir_post_fusionlog_ir_pre_fusion)num_ck_nodesr   )reorder_for_peak_memorygraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrk   r  r^   s   rX   r   z!Scheduler._init.<locals>.<lambda>1  s'     33+/+>+>*-djj/ rZ   )Ir_  r   r  rD   r|   rK   backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   r   r  r  keys	constantstorchbind_constantsr'  create_scheduler_noder  update_zero_dim_cpu_tensorr
  r  get_donated_buffersr   rU   r2  r   r   copyr9  r  mutation_renamesr   decide_global_ordering_of_commscompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr    ir_nodes_pre_fusionrk   torch._inductor.debugr  r  r  create_foreach_nodesr  logged_slow_fusionr   _pre_fusion_custom_passr~  r  finalize_multi_template_bufferscombo_kernelscreate_combo_kernel_nodesr  memoryget_output_names reorder_for_compute_comm_overlap$reorder_compute_and_comm_for_overlapprocess_grouped_nodescompute_last_usager?  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)	rV   r  rO  rL   r   r  r  r  r  s	   `       rX   r  zScheduler._init  sT   %'" <>"&'?"@(1(9%$.sO$5!&0%%**,""'') ,,113'
# >CCd003C
'')##**177+<+<+A+A+CDJJ 	DOO	 $$& 	# &*ZZ;
 !AJJL!O;
 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"Gq1::<?"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
,,.***= ))70

  ''177//44671773356DJ 22CCDJJODJ""$!4::&	djj) %/sO$5! :<'//	
y D;
8
B #Hs   V>8W'-WWc                   i }t         j                  j                  D ]d  }t        t         j                  j                  |   t        j
                        s9t        | t         j                  j                  |   d       ||<   f |S )N)rN   )rD   r|   graph_inputs_originalry   r   DonatedBufferr   )rV   name_to_donated_bufr]   s      rX   r  zScheduler.get_donated_buffers8  sp     GG11 	D!''77=r?O?OP,BGG11$7 $-#D)	 #"rZ   c                6    t         j                  j                  S rT   rD   r|   current_devicer^   s    rX   r  zScheduler.current_deviceC  s    ww%%%rZ   c                .    |t         j                  _        y rT   r  r_  s     rX   r  zScheduler.current_deviceG  s    !'rZ   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r?  r  r  )rV   r  s     rX   r  zScheduler.debug_draw_graphK  s1    ::>>:DASH+6 IrZ   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r   isEnabledForloggingINFOr   r  r   )rV   labelrL   s      rX   debug_print_nodeszScheduler.debug_print_nodesR  sF    GLL)HHUE"

 #  "# *rZ   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  ry   r   r,   rv  rM  r  r  r  r  s     rX   r  zScheduler.create_scheduler_nodeX  s    !- 	
@	
- ==?)$55r00"2C2CDE t,,boo.,T488%d++rZ   c                   t        t                  }g }| j                  j                         }t        j
                  j                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                   D 	cg c]  }	|	j#                         |vs|	 c}	t%        |      z   | _        y c c}w c c}w c c}	w )Nr   FrE  rH  )r   r   r9  r  rD   r|   listsr   ry   r2  r  r
  r   combo_kernels_autotuner+  r  r  rU   r   )
rV   removed_node_namesfe_nodeskept_node_namesnamesr]   r  rH  fe_noderL   s
             rX   r  zScheduler.create_foreach_nodese  sS   '_.11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *E!EE*Ec                     t        d      } G fddt        |         t        j                         j                  D ]  }|j                         D ]}  }|j                         }|j                         D ]X  }|v r=|v r9|   }|   }||z   }j                         D ]  }	|	   |u s|	   |u s||	<    D|v r	|   |<   Q|   |<   Z   d  fd 	 	 d	 	 	 	 	 	 	 	 	 d fd}
i }t        j                  j                  j                         D ]6  \  }}t        |t        j                        s!|j                   D ]  }d||<   	 8  j                  D ]  }t"        j%                  d|j&                         |j&                  J t)        |j&                  j+                         d 	      }|D ]6  }t        |t        j,                        sJ ||vs$|j                         ||<   8 t)        |j&                  j/                         d
 	      }|D ]d  }||v sJ | d|        ||   x} j0                  |   j                         D ]*  }|j3                  t5        |j                                      , f t7        |j8                  j:                        dk(  rGt=        t?        |j8                  j:                              x}rt        |t@              r|jB                  }nd}|j                         D ]  }t7        |jE                               dk  sJ |jE                         D ]  }  |      } |
||       |j3                  t5        ||             |   j                  D ]  }|j                         |j                         k(  r%t        |j&                  tF              sJ |j&                  jI                         D ]?  }  |      }|j3                  tK        ||j                                       |
||d       A    |j8                  jL                  D ]6  }t        |tJ              r |
|jN                  ||jQ                  |             8 |jS                   jT                         |j                         D ]  }|jE                         D ]y  }|j                          jT                    |      <   |j                          jT                  |<    jV                  jY                  ||       jV                  |j                         <   {   t        j                  j[                         D ]3  }t"        j%                  d|        |
|t]        t5        |                   5 t        j                  j^                  D ]  }|j/                         D ]|  }||v sJ | d|j                                 ||   x}s) j0                  |   jI                         D ]4  }t"        j%                  d||        |
|t]        t5        |                   6 ~   jT                  D ]  }|t        j                  j                  v rE |
|t]        t5        |                   t        j                  j`                  jc                  |       d|t        j                  jd                  v s |
|t]        t5        |                    tg        t        j                  j                  j                               D ci c]  \  }}||
 }}}t        j                  j`                  D cg c]  }||   	 c}t        j                  _4         j                  D ]C  }|j                         D ].  }|jk                  |j                            j                         0 E  jl                  D ]-  } jl                  |   jk                  |   j                         / yc c}}w c c}w )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y rT   )itemsr   
membership)rV   r	  r
  s      rX   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
rZ   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y rT   )r
  r	  r  r!  )rV   	node_users     rX   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append  s5    /

!!),##I.rZ   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w rT   )r   r  r
  r	  )rV   r  new_membershipr  	new_items	DedupLists        rX   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+r  )r	  zOptional[list[T]]r
  zOptional[OrderedSet[T]]r   r   )r  r  r   r   )r  DedupList[T]r   r  )rf   r   r   r(  r   r  r  )r  s   rX   r  r    s;     ,06:=(= 4= 	=/<rZ   r  c                N    | j                   v r j                   |          S | S rT   )r  )rO  r   rV   s    rX   r   z.Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677HrZ   c                P     |          j                  t        |||             y rT   )r  r  )used_by_namer  rq  r  name_to_usersr   s       rX   add_userz0Scheduler.compute_dependencies.<locals>.add_user  s)     &./66K9rZ   Nzscheduling %sc                    | j                   S rT   r  r  s    rX   r   z0Scheduler.compute_dependencies.<locals>.<lambda>  
    AFF rZ   rF  c                    | j                   S rT   r  r  s    rX   r   z0Scheduler.compute_dependencies.<locals>.<lambda>  r  rZ   z not in r   )r  )mutating_bufT)r  zscheduling output %sz+scheduling output %s for unbacked symint %s)rO  r   r   r   )FF)
r  r   r  r  rq  r   r  r   r   r   )7r   r   r  r   r  r   rU   rh   r  rD   r|   r  r	  ry   r  rN  free_symbolsr   r?  rL   rK  get_unbacked_symbol_defsSymbolget_unbacked_symbol_usesr2  r   r(   rk   r   r   r  r  r'   r  rj   rG   rH  r)   r   r]   rq  r   r  r  r  r  r   graph_outputsmutated_inputsr!  r  r  mutated_input_idxsr   r   )!rV   r  rL   buf1	buf1_name	buf2_namelist1list2combinedrG  r  unbacked_symbol_to_origin_noder]   valfsunbacked_symbol_defsr  unbacked_symbol_usesr<  r   r   	node_modealt_namero   
other_namer  rX  r   r  	inp_namesr  r  r   s!   `                             @@@rX   r  zScheduler.compute_dependencies  s    CL	<
 	<> @K?V?V@
 JJ 	LD((* L MMO	!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0#5#5#7 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L(	 !&!			;	 	 		
 	 MO&
 --335 	>ID##uzz*** >B9=226>	>
 JJ J	DIIotyy1 99(((#)		224:J$  * H!!U\\222 ::8<215H $*		224:J$  * C:: c"@!AB: 8::AG#003??A C))'#,,.*ABCC D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG*.))*D*D*F EJ)/
);J -- '
 P %ZtDEEEE, ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '')  # 1 1 3 H>AllnD))&*:;69llnD))(3//33HhG ++CLLN;IJ	Z 002 	>HII,h7Xz'(*;<=	>
 77(( 
	JC113 	J:: c"@"E"E"G!HI: 7q9919$($5$5a$8$I$I$K J		I8UV !:gh6G+HI	J	J
	J )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C // 	SD''-77d8K8Q8QR	S
&
s   _ /_&c                Z  	 g }t        | j                        D ]  }dd	d}|j                         D ]  }t        	fd|j                  D              }|r\t
        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t
        j                  d|j                                t        j                  j                  j                  |j                                |j                  j                   D ]  }|j"                  | j$                  v s| j$                  |j"                     j                  }|D cg c]0  }|j&                  j                         |j                         k7  s/|2 c}| j$                  |j"                     _          t)        t        |            | _        | j                  D ]  }|j+                           yc c}w )	z0
        Remove any nodes without users
        c                r    | j                   xs* | j                         t        j                  j                  v S rT   )r  rU   rD   r|   r-  )ro   s    rX   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_userd	  s&    ||Tt}}!'':T:T'TTrZ   Fc              3  .   K   | ]  } |        y wrT   r   )r  ur4  s     rX   r  z2Scheduler.dead_node_elimination.<locals>.<genexpr>i	  s     #Ma$6q$9#M   zremoved dead buffer: %sTzremoved dead operation: %sN)ro   r  r   r   )r  r  r   rQ  rQ   r   r?  rU   rD   r|   r  r!  rs  r  r-  r   r   r]   r   rL   r   r6  )
rV   updated_nodesrL   active_buffersr   can_eliminater  rQ   r6  r4  s
            @rX   r  zScheduler.dead_node_eliminationZ	  s    TZZ( 	DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22 DyyD$4$44 $ 0 0 ; A A',="#0AT]]_0TA=((39-	8 (=12
 JJ 	#D  "	#=s   0H(H(c                    t        t                  t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S rT   r  )ds    rX   r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>	  s
    aff rZ   rF  )r!  rK  r   r]   r  )rO  r   r2  rn   seenvisits     rX   r@  z2Scheduler.topological_sort_schedule.<locals>.visit	  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  rZ   )rO  rG   r   r   )r   rG   r  rH  )rV   r  rL   r]   r2  rn   r?  r@  s       @@@@rX   r  z#Scheduler.topological_sort_schedule	  sy     +,.59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	rZ   c                @    t        t                  }t        |t        t        t
        t        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        t         fd|D                    S )Nz+get_unmet_dep_nodes is not implemented for .c              3  X   K   | ]!  }j                   |   j                          # y wrT   )r   rY   r(  s     rX   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s%     Xc))#.??AXs   '*c              3  <   K   | ]  }j                   |     y wrT   r9  )r  rO  rV   s     rX   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s     Qat66q9Qs   )r   r   ry   rM  r  r  r  r   r!  r]   RuntimeErrorre   r   )rV   r  
unmet_depsr   unmet_dep_opss   `    rX   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes	  s    _&
)&"	
 // )sxx() =d5k]!L  YZXJQ=QQRRrZ   c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r   zTopological sort failed!)	r  fromkeysr  rI  rk   r  r  r	  r   )rV   r  r  childrenrL   r$  r   crO  vzero_deg_nodesro   s               rX   rZ  z!Scheduler._topological_sort_nodes	  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                x   i }| j                   D ]~  }t        t                  }|j                  D ]B  }| j                  |j
                     j                         }|j                  |       |||   z  }D |||j                         <   ||_	         t        | j                         D ]  \  }}||_        ||_         y)z.
        Populate each node.ancestors
        N)r  r   r   r   r   r]   rY   r!  rU   r   r  r   r   )rV   name_to_ancestorsrL   r   r   dep_node_namer  s          rX   r  zScheduler.compute_ancestors	  s    
 9;JJ 	'D"3)I.. > $ 0 0 : K K Mm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#rZ   c                H   | j                   D ]  }t        j                  st        |t        t
        f      r#|j                         st        j                  dk7  rN|j                         D ]3  }t        |t              r|j                         r$|j                          5  y )Nhalide)r  r   r{  ry   rM  r  r?   cpu_backendrC  ri  r  )rV   rL   r  s      rX   r  zScheduler.merge_loops	  s    JJ 	$D44 d]4F$GHKKMf&8&8H&D) $!%75;L;L;N!!#$	$rZ   c                j   t        d      5  t        d      D ]  }t        |      }t        j	                  d|dz   |       | j                  |      }t        |      }t        j	                  d|dz   ||       ||k(  s|dk(  sjt        j	                  d|dz           n |cddd       S # 1 sw Y   yxY w)zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodes
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r[  rk   r>  r?  fuse_nodes_once)rV   r  r  old_lennew_lens        rX   r~  zScheduler.fuse_nodes	  s     12 	2Y e*  EE
 ,,U3e*  TE	 g%A$$Eq1u '( +	 	 	s   A5B)B))B2c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r#  ry   r  r  )rV   	new_nodesrL   s      rX   r  zScheduler.process_grouped_nodes
  sJ     .0	JJ 	D!+D2F!GdV	 
rZ   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        ddd      5  |j                  |      cddd       S # 1 sw Y   yxY w)
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)log_pt2_compile_eventdynamo_compile_column_usN)rk   r[  r  ry  r   r_  )rV   r  r`  backends       rX   r_  zScheduler.benchmark_fused_nodes!
  st     5zA~~q$$&$""6*#"&%D
 	8
 007	8 	8 	8s   
A%%A.c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        d      5  |j                  ||      cddd       S # 1 sw Y   yxY w)r^  r   r_  N)rk   r[  r  ry  r   generate_kernel_code_from_nodes)rV   r  benchmark_kernelr`  rc  s        rX   re  z)Scheduler.generate_kernel_code_from_nodes3
  sq     5zA~~q$$&$""6*12 	T::5BRS	T 	T 	Ts   A##A,c                    || _         | j                  |      }t        d      5  |j                  |      cddd       S # 1 sw Y   yxY w)r^  r_  N)r  ry  r   benchmark_codegened_module)rV   moduler`  rc  s       rX   rh  z$Scheduler.benchmark_codegened_moduleA
  sH     %""6*12 	>55f=	> 	> 	>s	   ?Ac                   	 	 	 	 	 	 dd}t        | j                        D ]A  \  }}t        |t              st        |j                  t
        j                        s=|j                  }t        j                  j                  s|j                         \  }}nt        d |j                  D              }t        |t        j                  j
                  j                        r|j                  j!                  |       |j#                         }|j$                  }t        |t
        j&                        sJ |j$                  }	t        |	t
        j(                        sJ |j*                  |	_         |||	       | j-                  |	      }
|
| j                  |<   |
| j.                  |j1                         <   |
| j2                  |j1                         <   t5        |
j7                         |j7                               D ]3  \  }}|| j8                  |j1                         <   |j:                  |_        5 |j<                  |
_        |j>                  |
_        |j@                  |
_         D y )Nc                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y rT   )rU   ry   r   r<  rD   r|   r"  r]   
name_to_opoperation_namebuffersr  remove
operations)	orig_noderJ  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rX   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_bufferN
  sW    !) 1 1 3%..0MmS1jARTW6XXX'::<$779LlC0Z@PRU5VVV&&'89)HM""#34&2H#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,rZ   c              3  |   K   | ]4  }t        |t        j                  j                  j                        r| 6 y wrT   )ry   r   r   select_algorithmExternKernelCaller)r  timings     rX   r  z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>r
  s6       &) & % @ @ S S  #s   :<)rq  zir.MultiTemplateBufferrJ  zir.OperationBufferr   r   )!r  r  ry   rM  rL   r   MultiTemplateBufferr   test_configs%force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferrg   r  r2  rU   r9  r  r   r   rQ   r   r   r   )rV   rw  r  rL   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_nodenew_outold_outs                rX   r  z)Scheduler.finalize_multi_template_buffersM
  s   	8-	89K	8	86 !, 0	@GAt$.:		2114 "YY
**PP*4*C*C*E'$a'+*4*C*C	($ $OO&&?? II778HI 0 < < >+00!+r}}===(--
!*b.@.@AAA$.$5$5
!(Z@%)%?%?
%K" 2

15G!!$--/2;M''8(+&224d6F6F6H) 2$GW <CD$$W%5%5%78$+MMGM	2 04~~",/3~~",04"-a0	@rZ   c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moder  )r   rL   r   r  rN  s     rX   r  z,Scheduler._any_atomic_add.<locals>.<genexpr>
  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r  rV   	node_lists     rX   _any_atomic_addzScheduler._any_atomic_add
  s     

 
 
 	
rZ   c           	         t        d fD              }t        j                  s|syj                         r(t	        j                         t        j                        r j                         sj                         ryj                         }|d   j                         sJ j                  dk(  ryj                         }t        t        j                  ||            } j                  |      ryddlm t%              |d   j                         J dfdt&        j(                  j*                  j-                         	 	 	 	 d fd}|rt        d	 fD              rj                         durj                         nj                         t	        t        j.                        sJ j0                  }j3                         \  }	j3                         \  }	r j5                  |      n j5                  |      \  }
g d}t7        |j9                         d
       D ]  \  }}t	        |t&        j(                  j                  j:                        s5s&t=        |d      r|j>                  j>                  k7  r]|z   k\  r nQ|dz  }|t        j@                  kD  r n7jC                  |      5  jE                  |g ||             ddd        tG              dk(  ryd fd}|S  ||       ||       ||      d fd}|S # 1 sw Y   xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]>  }|j                         xr( t        |j                         t        j                         @ y wrT   )ri  ry   r  r   r|  rN  s     rX   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>
  sE       
  MMO J1..0"2H2HIJ 
s   AATr   r^  CompilationErrorNc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r>  r  r  DEBUGr?  rH  r4   r5   )ms_fusedms1ms2r8  r9  s      rX   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion
  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rZ   c                    j                  | d      }t        j                  |      }j                         sd }||fS j	                  d|      }t        |t              sJ ||fS )NT)rf  triton_)kernel_namesource_code)re  r   loaduse_process_pooltritonry   r   )r  src_codemodfutasync_compilerV   s       rX   compile_kernelz3Scheduler.speedup_by_fusion.<locals>.compile_kernel
  s     ;; < H ""8,C 113
 : $**yh*W!#|444:rZ   c              3  @   K   | ]  }|j                         d u  y wrT   r  rN  s     rX   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>
  s#      %
23A!-%
s   c                    | d   S r1  r   r  s    rX   r   z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s
    ad rZ   rF  allowed_prologue_inpsr   Fc            	        t        d      } d }i }D ]V  \  }}}	 ||j                          j                  |      5  j                  |	      \  }}|||<   || k  r|} |}d d d        X  |        | z   k  r|j                  |       |_        yy# t        $ rQ}t        j	                  t
        j                        r$t        j                  d
sdndt        |             Y d }~d }~ww xY w# 1 sw Y   xY w)NinfzException in compiling %s: %sr   r"  TF)r2  rn   r   r>  r  r  r  r?  r   swap_as_triton_callerrh  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingschoicefuture	mod_fusedr  r  pathr`  epilogue_fusionfuture_choicesr  r  r  r  rV   s            rX   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready1  s$   $U|"& 1? 5-FFI!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 550 <c239-/2M88I1<J. 1 % !%227==A&,, ?2A
z #A
 !!5 5s#   B$C7	C4#AC//C47D 	c                    ddl m}  	 d   d   d   fD ]  }||j                           j                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       y        t        d      rWz   k\  rOfj                  vr?j                  j                  f       t        d      j                  fd	       z   k  S # | $ r Y y	$ r}d
t        |      v rY d }~y d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   rX   r   zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s(    053605365?8@3;sSy3I% rZ   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  rn   rh  mathisinfr   r  r!  r   r  r   )r  r  r  r  r  r  r  r  r  r  r`  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rV   r@  s      @@@@@@rX   r  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready`  s   ; *!,)!,/2  )
 ?JJL) "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(+CD$xc2 0>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E AE +5E !5E A3E E.E.E)(E))E.)r  r2  r  r2  r  r2  r   r   )r  r  r   z)tuple[Optional[LambdaFuture], ModuleType]r   )$r  r   benchmark_fusionri  ry   r  r   TritonTemplateBufferrm  rC  r[  re   r   r  r  r  triton.compiler.errorsr  r7  r   r   r  AsyncCompiler|  r  r  r_  rK  r	  r  r   r   max_epilogue_benchmarked_choicesr  r  rk   )rV   r8  r9  is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  triton_choicesr  unfused_timer  r  r  r`  r  r  r  r  r  r  r  r  r  r@  s   ```            @@@@@@@@@@@@@rX   speedup_by_fusionzScheduler.speedup_by_fusion
  s]       
 U^ 
 

 &&/@ u668":Q:QR!! oo'Q**,v ;;%oo'y{KHI
 0;u% #..0!!!	" 55BBD	.	6	$  %
8=u~%
 "
 $557tCO # ''),,. 
 j"*@*@AAA'66N..0FAs  ..0FAs # **;7//< C TVNN(.$$&N) V$ "&%//*<*<*U*UV ((?@44
8X8XX39,!#!F$K$KK55f= V"))6*TN?4S*TUV V3V8 >"a'%! %!N (' !/{ ; .{ ;&4_&E#@ @D ('oV Vs   M

M	c                <    | j                   |j                            S )z0Look up the node in Scheduler name_to_fused_node)r9  r?  r  s     rX   r{  zScheduler.get_fused_node  s    &&t':':'<==rZ   c                    t        |      t        j                  t        j                        rBt        j                  d       D ](  }t        j                  d|j                         z          * i 	 	 	 	 	 	 d fd	 	 	 	 	 	 d fd} j                  |      D ]  \  }} |||        j                  |      } j                  |      } j                  ||      sD j                  ||      rW j                  ||      }t        |      r|||f|<   |||f|<   |s ||        t               }j                         D ]j  \  }}	}
||v r|j                  |        j                  |	      |	u sJ  j                  |
      |
u sJ  |       sO j                  |	|
      rb |	|
       l t        d       } j!                  |      } j#                  |       |S )	a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  c                   t         j                  d| j                         |j                                | j                         }|j                         |k(  sJ j	                  |      j                  | |      }j                  |        j                  |       j                  |       j                  j                  |j                         D ci c]  }|j                         | c}       |S c c}w )Nzfusing %s with %s)r>  r?  rU   r[  ry  r  ro  r!  r9  r
  rC  )r8  r9  r`  node3rO  rI  rV   s        rX   fuse_two_nodesz1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@u$@ L As   C6c                   j                  |       v sj                  |      v rj                  j                  |       j                  j                  |      d             }|J |\  }}}j                  |d        j                  |d        j                  |      |u sJ j                  |      |u sJ  |       rj                  | |      r ||       j                  |       v rωj                  |      v ry y rT   )r{  r  r   will_fusion_create_cycle)	r8  r9  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsrV   s	         rX   resolve_pending_fusionsz:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
Iy##It4##It4**95BBB**95BBB!|t'D'DUE'Ry)4' ##E*o=&&u-@rZ   c                    | j                   S rT   r  r  s    rX   r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>  s
    !++ rZ   rF  )r8  rG   r9  rG   r   rG   rC  )r   r>  r  r  r  r?  r   get_possible_fusionsr{  r:  r  r  callabler   r!  rK  r  r:  )rV   r  rL   r  r8  r9  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  rI  r  s   `          @@@rX   rX  zScheduler.fuse_nodes_once  s*    !'""7==1;<# @  (<(<(>!>?@  	
	$	->		 	5$	5->	5	52 !55e< 	-LE5 $E51''.E''.E}}UE*43P3Pu4 00>G$.5ue-DOE*.5ue-DOE*ue,)	-, @J|3B3I3I3K 	5/M9i 44 $$]3&&y1Y>>>&&y1Y>>>t'D'D9( y)4	5 {(=>..u5!!%(rZ   c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %d...r  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S rT   r  r  s    rX   r   z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>8  s
    q{{ rZ   rF  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)r   r  rk   r   r?  r  r+  rh  rW  speedup_by_combo_kernelr   r  rK   r   ro  r!  r9  r
  rC  rU   rK  r  r:  )rV   r  rI  r  num_nodes_orignumr  rH  r  rL   rO  s              rX   r  z#Scheduler.create_combo_kernel_nodes  s    !,TZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
S

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y rT   )r:  r9  )rV   r  rL   s      rX   r:  zScheduler.prune_redundant_depsB  s%     	?D%%d&=&=>	?rZ   c                   	
 g 	t        t        t        t        f             
d	
 fd}t        j                  t
              }|D ]=  } j                  |      r|j                         D ]  }||   j                  |        ? |j                         D ]
  } ||        t        j                  rat        j                  t
              }|D ]&  }t        |dd      }|s||   j                  |       ( |j                         D ]
  } ||         j                  	      		j                   j                  d       t         j#                  dt%        	             	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                P   t        |       D ]  \  }}| |dz   d  D ]  }||f}|v rj                  |       j                  ||      rj                  |       A|j	                         s|j                         sbj                  ||      suj                  ||f         y r1  )r  r!  r:  r  ri  rm  )r  node1_indexr8  r9  rG  possible_fusionsr?  rV   s        rX   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsO  s    &/&6 @"U";?#45 @E %.Cd{ HHSM}}UE2(//4++-1A1A1CuJ )//?@@rZ   r   NT)rG  reversezfound %d possible fusionsr  r4  r   r   )r   r  rG   r  r   r   unfusable_noder  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr>  r?  rk   )rV   r  r  buffer_names_groupingrL   r   node_groupinggroup_groupingr   r  r?  s   `        @@rX   r  zScheduler.get_possible_fusionsF  sh    % 13D DEFH	@  !, 7 7 = 	8D""4(--/ 8%c*11$78	8
 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLrZ   c                    t        t                  d fd|j                         j                  j	                         |j                         j                  j	                         z  |j
                  j                  j	                         |j
                  j                  j	                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wrT   rE  r  rO  
found_pathrV   s     rX   r  zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s+      H #4#:#:1#=>H   ")ry   r  r!  rD  issubsetr   r   r  )rL   combined_ancestorscombined_namesr	  rV   visiteds    rX   r	  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  rZ   c              3  H   K   | ]  } j                   |           y wrT   rE  r  s     rX   r  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s!     WqJt66q9:Wr
  zwill create cyclerL   rG   r   r   )r   r  rD  _dictr  r   r  r7  )rV   r8  r9  cycler  r  r	  r  s   `   @@@@rX   r  z"Scheduler.will_fusion_create_cyclex  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78rZ   c                    ddl m 	 	 	 	 d fd} ||      } ||      }t        fd|D              }t        fd|D              }|j                  |      }d}	|D ]  }
	 |	t	        |
d         z  }	  j                  ||      }t        j                  j                  j                  |	d	|z        ry
y# t
        $ r Y  yw xY w)a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two ndoes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyc                0   g }| j                   j                  D ]y  }j                  j                  |j                        }|s+t        |j                        dk(  sD|j                  j                         s_|j                  |j                         { |S r1  )
r   r   r   r  r]   rk   rQ   rL   has_tensor_outputr  )rL   r   r5  r   rV   s       rX   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,, ,&&**27733syy>Q.3883M3M3OMM#((+, MrZ   c              3  .   K   | ]  } |        y wrT   r   r  r   r  s     rX   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #Sc$4S$9#Sr7  c              3  .   K   | ]  } |        y wrT   r   r  s     rX   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  r  r7  r   r  F    T)rL   rG   r   zlist[ir.Buffer])r  r  r   intersectionr   r  score_fusion_memoryrD   r|   r  statically_known_gt)rV   r8  r9  r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadrG  	bw_savingr  s   `           @rX   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$ 	C3s1v;.	 ,,UE:	 77//iP  s   $B88	CCc                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  r   r   )rV   r8  r9  proximity_scores       rX   are_long_distant_nodesz Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##rZ   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]}  }t        j                  j                  |      }	||   }
||   }t        |
t              rt        |t              sdt        |
       dt        |       ||<   k|
j                         |j                         k7  r(d|
j                          d|j                          ||<   t        |
j                        t        |j                        k7  rd||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed}t        |	t        j                        sd|	j                    }d	|
 d| d
| ||<    t#        |      S c c}w c c}w )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r|  r]   rD   r|   r  ry   r'   re   r  rC   r  
get_offsetnormalize_with_stride_orderr   r  rg   r   )rV   r8  r9  common_buf_namesreasonsr   node1_name2depnode2_name2deprX  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  rX   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( ,	H''$$X.C$X.G$X.Ggy1GY9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#56'

|4
"7)6'"ZLI HU,	\ 7|c YXs   G5G:c                   t         j                  rt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt        |	d       \  }}}t!        |t"              rt!        |t"              sy|j$                  |j$                  k7  r3|j'                         |j'                         k(  r| j)                  |      S y|j+                         s|j-                  ||       nV|j+                         s|j-                  ||       n3t.        j1                  d|j3                         |j3                                | j5                  ||      S c c}w c c}w )z
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatibile with node1 if that's more efficient.
        c              3  <   K   | ]  }|j                           y wrT   )ra  rN  s     rX   r  z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>C  s      8
AHHJ8
rB  r   r  c                    | d   S r  r   r  s    rX   r   z=Scheduler.shared_data_after_reordering_loop.<locals>.<lambda>g  s
    1 rZ   rF  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r{  r  r   buffer_namesr|  r]   r/  r  rD   r|   r  r  r  rk   r  ry   r'   r  ru  dep_size_hintrd  r   r  r?  rU   r  )rV   r8  r9  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr   r2  r3  
candidatesbuffer_namer4  r5  _numels                 rX   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop8  sA    00C 8
!&8
 5
 "..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a $'z~#F '9-Z5Sw///
   "g&7&7&99))'22 !!#++GW=##%++GW=##Q   ''u55e YXs   >I0I$c                    t        |t        t        f      xr) |j                          xr t	        |j
                         S )z>
        Is this node unfusable under any conditions.
        )ry   r  r  ri  rA   rL   r  s     rX   r  zScheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
rZ   c                   |j                         t        j                  j                  k  ry|j	                         }|j                         }d}|||z  kD  r	 |d       yt        d |j                         D              }|t        j                  j                  j                  j                  fk(  r	 |d       yd	d} ||j                         j                        r|j                         s	 |d       yy)
zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        Tg?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]J  }|j                   <|j                   j                         D ]  }|j                  dk(  r|j                   ! L y w)Ncall_function)rL   r  rW   r  )r  rO  r  s      rX   r  zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sT      
vv!VV'')	
 tt&	 HH

s   AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                <    | j                   dk  xr | j                  S )Nr  )itemsizeis_floating_point)r  s    rX   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBrZ   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   r   )rD  rD   r|   invoke_quant_opsr  r  r  rC  r   opsatenconstant_pad_nddefaultr  r  rR  )	rV   prologue_noder!  r@  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  rM  s	            rX   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHI!>>@h rZ   c                   ||u ryt        ||      }|j                         r0| j                  |j                               j	                  ||      ryt        |t              st        |t              r	 |d       yt        |t        t        f      r|j                         s	 |d       yt        |t        t        f      r|j                         s	 |d       y|j                         |j                  z  r	 |d       y|j                         r!t        j                  s	 |d       y|j                         s|j                         r	 |d       y|j                         }t        |t        j                         s	 |d	       y|j#                         }t%        d
 |j&                  D              |z
  }|j)                         |z  r	 |d       y|j+                         s|j+                         r	 |d       y|j-                         dd D ]B  }|j/                         }|D ]+  }	t1        fd|	j2                  D              r" |d         y D t        |t4              s|gn*|j6                  D 
cg c]  }
|
j                         s|
 c}
}t9        |      dk(  sJ |d   }t9        d   j:                        dk(  rSt9        d   j:                  d   j2                        dk(  r+d   j:                  d   j2                  d   j<                  |u s	 |d       y| j?                  |||      sy|j                         r9|j+                         s |j                         st        j@                  s	 |d       y|j)                         tB        jD                  jF                  z  s+|j)                         tB        jD                  jF                  z  r	 |d       y|j                         }|j                         }||k7  r |d||       y~| jI                  ||      }|t        jJ                  k  r"t        jL                  r| jO                  ||      }tP        jS                  tT        jV                        r4tP        jY                  d|j[                         |j[                         |       tB        j\                  j_                  | |||      sy|j                         |j                  z  rY| ja                  ||      xrE tB        j\                  ja                  | |||      xr! | j                  |      ja                  ||      S tB        j\                  jc                  | |||      xr! | j                  |      jc                  ||      S c c}
w )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  <   K   | ]  }|j                           y wrT   r>  )r  inps     rX   r  z%Scheduler.can_fuse.<locals>.<genexpr>  s     Ec3<<>ErB  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  :   K   | ]  }|j                   v   y wrT   r  )r  ro   prologue_nodess     rX   r  z%Scheduler.can_fuse.<locals>.<genexpr>  s     QttyyN:Qry  z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)2r7  ri  ry  r[  can_fuse_multi_outputs_templatery   r  r  r  rD  r   r   prologue_fusionrd  r  r   r  get_allowed_prologue_inpsr   r  rH  r  rC  r   rQ  rQ   r  r  rk   r   rL   rW  r  rD   r|   no_fuse_buffer_namesr  score_fusion_memory_thresholdr{  rE  r  r  r  r  r?  rU   choicesr:  can_fuse_verticalcan_fuse_horizontal)rV   r8  r9  r@  r  r  unsupported_prologue_argsrL   	node_outsr   rO  template_snodestemplate_snoder`  device2shared_data_scorer\  s                   @rX   r:  zScheduler.can_fuse  s
    E>u%4#3#3$

)
)%
7$8 e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-))01!!#u'8'8':HI779Hh(?(?@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--/53Q3Q3SPQ"__.N&s+ % ,,.	$ %CQsyyQQUV$%% "%);< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sS**,!!#))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 44UEB D DD11 $ F Fue T))'--8##.  !	 yy!!$u6GH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMC Bs   5VVc                   |j                         }t        ||      }t        t              }|j                  D ]j  }| j
                  j                  |j                  |j                        }t        |t              r| j                  |||      rW||   j                  |       l |j                  j                  D ]  }t        |t              s|j                  | j
                  j                  |j                  |j                              }	|	sV|	D ]&  }
| j                  |
|      s|	j!                  |
       (  t#        d t$        j&                  j)                  |j+                               D              }||z  r	 |d       y|j-                         }|D ]E  }| j.                  |   j1                         }|| j2                  |   j4                  z  s= |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wrT   r  r  s     rX   r  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>z  s      $
 HH$
r  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rH  r7  r   r   r   r  r  r]   ry   r)   fusable_weak_depr  r   r   r'   fusable_read_and_writero  r   r  r  r  r   rD  r   rY   r9  r   )rV   r8  r9  node1_buf_namesr@  remaining_deps_by_namer   r]   cd	remainingr5  remaining_depsnode1_op_namesr.  s                 rX   rc  zScheduler.can_fuse_vertical[  s     002u%7B47H++ 	5C((,,SXXsxx@D#w'D,A,A#ue,T"4(//4		5 ##** 		-Bb),.22%%))"''277;I # -B222r:!((,-		- $ $
 445K5R5R5TU$
 

 O+
 +,224" 	D&&t,==?G 7 7 @ J JJ>?		 rZ   c                    |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              sJ t        j                  t        j                        ry| j                  |j                     }|j                  j                  D cg c]  }|j                   |k(  s| }}t        fd|D              S c c}w c c}w )NFr   r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wrT   )ry   r'   r   r  r   TMPr  )r  r  r  s     rX   r  z-Scheduler.fusable_weak_dep.<locals>.<genexpr>  sm      

 	 tY' ('

DHH==(

ekk)( 		UZZ'(
s   A7A:)r]   rH  r   r   r  rk   ry   r'   r   r  r   rw  r  r   rQ  )	rV   weak_depr8  r9  r  mutating_writes	real_namer  relevant_readss	       `    rX   rm  zScheduler.fusable_weak_dep  s
    == 6 6 88 **11
zzX222 
 

 1$"%+++u{{DHH5++H,A,AB	"..44
		Y8ND
 
  

 '
 
 	
#

s   "DD,Dc                   t        |t              rH| j                  j                  |j                  |j                        }||j                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }|j                  |j                  k(  xr\ t        |j                        t        |j                        k\  xr/ |j                  d t        |j                         |j                  k(  S t        |t              r| j                  j                  |j                  |j                        }| j                  j                  |j                  |j                        }|j                   |j                   k(  r|j                   ||k(  ryyr   )ry   r'   r  r  r]   r   r  r   rw  r   r{  r  ru  rk   r  r(   r  )rV   r  r  	read_name
write_names        rX   rn  z Scheduler.fusable_read_and_write  s`   dI&--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rZ   c                    d}|| j                   vr2	 |j                         s|j                         }|| j                   |<   |S | j                   |   }|S # t        $ r Y -w xY wr  )r  has_unbacked_symbolsnumbytes_hintKeyError)rV   r   ress      rX   r>  zScheduler.dep_size_hint  sz    d000//1++-C /2D&&s+ 
 ,,S1C
   	s    A 	A A c                2    t        |j                  j                        t        |j                  j                        z   }t        |j                  j                        t        |j                  j                        z   }t	        ||      dz  t        ||      k  r||kD  r|}|}|}|j                  j                  |j                  j                  z  D cg c]4  }||j                  j                  v s||j                  j                  v r|6 }}t         fd|D              S |j                  j                  |j                  j                  z  |j                  j                  |j                  j                  z  z  }t         fd|D              S c c}w )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        rH  c              3  @   K   | ]  }j                  |        y wrT   r>  r(  s     rX   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ?3t))#.?rz  c              3  @   K   | ]  }j                  |        y wrT   r  r(  s     rX   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     Is4%%c*Irz  )rk   r   r   r   r  r  r  )	rV   r8  r9  node1_dep_lennode2_dep_lentmpr   r$  common_memory_depss	   `        rX   r  zScheduler.score_fusion_memory  sh    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT}, !,,22U5F5F5M5MM%++111SE<M<M<T<T5T D  ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   9Fc                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   rF  r   )
rk   r[  r   ry  get_fusion_pair_priorityr  r  r	  operator
itemgetter)rV   r  "possible_fusions_group_by_priorityr8  r9  r`  fusion_pair_priority&possible_fusions_with_highest_prioritys           rX   r   z4Scheduler.get_possible_fusions_with_highest_priority  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55rZ   c                B    t        j                  j                  | g| S )z-
        Shim for list.sort(key=...)
        )rD   rb  score_fusionr  s     rX   r  zScheduler.score_fusion_key%  s     yy%%d3U33rZ   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rD   r|   r  r  r  r  r  r
  r   )rV   r  rL   s      rX   r  zScheduler.compute_last_usage-  s]    
 ))A)A)CDTZZ( 	8D 3T5L5LM&&t7	8rZ   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]N  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   }t        |t        j                        r*t        j                  j
                  j                  |       |j                  }t        |t        j                         r|j#                         sJ t        j                  j
                  j                  |j                         Q | j                  j%                          y)z*Free any buffers that are no longer neededN)rK  r  rD   r|   r  r}   freedr   r   codegen_freerL   r  ry   r   r  r   r  is_input_bufferclear)rV   r]   r   rZ  storages        rX   free_bufferszScheduler.free_buffers8  s8   %%gg%%&gg""(()
 	DD
 t'''&&t,<<>GG((55chh?---gg**40c2#5#56GG((55c:!hhG"7BMM:w?V?V?XXGG((55gllC%	D( 	!!'')rZ   c                    | j                   j                         D ]  }|j                           | j                          y rT   )r  r   flushr  )rV   rc  s     rX   r  zScheduler.flushP  s3    }}++- 	GMMO	rZ   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Ninductorextern_callsr   F)increase_kernel_countztype(node)=)ry   r  r   rD   set_kernel_handlerr$   r  r  rL   r   r  re   r  r|   r}   r  )rV   scheduler_noderL   s      rX   codegen_extern_callzScheduler.codegen_extern_callU  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                P   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  rLt        j                  j                  |      x}j                  dk  rt        |t        j                               t        |j                        r,|j                  dk(  st!        t        j                                ||       S )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)r?   re   r  rD   r|   add_device_infor#   rF  r   r   r  get_device_propertiesmajorr*   inspectcurrentframer+   )rV   r`  device_schedulingdevice_propss       rX   create_backendzScheduler.create_backendd  s    &++&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$V[[E-A#G$8$8$:;; &&rZ   c                    |J || j                   vr| j                  |      | j                   |<   | j                   |   S rT   )r  r  r_  s     rX   ry  zScheduler.get_backendy  sB    !!!&$($7$7$?DMM&!}}V$$rZ   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w rT   )r  r
  r  r|   r  )rO  r  rV   s     rX   	get_orderz*Scheduler.enter_context.<locals>.get_order  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   rF  )rO  ztorch.fx.Noder   r   )rC  rL   r  r   r  r  r  r  rD   r|   r}   enter_context)rV   rL   r  rO  r  r  r  lasts   `       rX   r  zScheduler.enter_context  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                    	 | j                   |   j                  }t        fd|D              xr || j                  vxr || j
                  vS # t        $ r Y yw xY w)NFc              3  ^   K   | ]$  }|j                   xs |j                         v  & y wrT   )r  rU   )r  ro   fused_node_namess     rX   r  zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VC3C CCVs   *-)r   rQ   r  rQ  r  r  )rV   r]   r  rQ   s     ` rX   $can_buffer_be_removed_through_fusionz.Scheduler.can_buffer_be_removed_through_fusion  sn    	$$T*00E VPUVV 4D1114D333	
  		s   A 	AAc                   |j                         sy|j                  yt        |j                  t        j                        ryt        |j                  t        j
                        ryt        |j                  dd      ryt        |j                  d      r1t        d |j                  j                  j                  D              ryy)zBReturn True if we should partition the inductor graph on this nodeTNunbacked_bindingsrg   c              3  l   K   | ],  }t        |t        j                        xr |j                   . y wrT   )ry   r  rN  r  )r  exprs     rX   r  z-Scheduler.should_partition.<locals>.<genexpr>  s1      0
 tUZZ(>T->->>0
s   24F)r?   rL   ry   r   
DeviceCopyConditionalr   r   r  rg   r  r  s     rX   should_partitionzScheduler.should_partition  s    {{}99dii/dii0499148499h'C 0
		((--0
 -
 rZ   c                    i }|j                  t        j                  j                         | j                  D ]3  }|j
                  j                         D ]  \  }}|j                  ||<    5 |S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r
  rD   r|   r  r  r   r	  rL   )rV   r2  rL   r]   scheduler_buffers        rX   get_name_to_nodeszScheduler.get_name_to_nodes  sr     UWAGG001JJ 	;D*.*>*>*D*D*F ;&&%5%:%:T";	; rZ   c           	        g }t        t        j                  j                               }| j	                         }t        t        |      t        |            D ]e  \  }}t               }|D ]+  }	|j                  |	j                  j                                - |j                  |      }
t        j                  j                  |D 	cg c]  }	|	j                   c}	      }t        |j                  |j                   z  D cg c]  }|j"                   c}      |z
  }t               }|D ]  }	|j                  |	j$                          |D ci c]  }||v r|||    }}|D ci c]  }||v r	|||v rdnd }}|
D cg c]  }||   	 }}|j'                  t)        ||||             |j+                  ||
z
        }h |ddd   S c c}	w c c}w c c}w c c}w c c}w )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        TFNr  )r   rD   r|   r  r  r  r  r
  r   r  r  r   r  r  r   r   r   r]   r   r  r.   r  )rV   
partitionsskip_cudagraphs
signaturesunmet_output_namesr2  	partitionskip_cudagraphoutput_namesrL   returned_output_namesr   r  partition_input_namesr  r]   input_nodesinput_deallocationoutput_nodess                      rX   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature  s     
'(@(@(BC--/),Z (?";*
 -	%I~ -7LL! A##D$8$8$=$=$?@A %1$=$=>P$Q! '11<<.78d!!8K K,=,=@R@R,RSqAFFST "
 5?L ! =$++DOO<=
 2<' l4((K  2"<' d&::dE" "
 <QQ4L.QLQ' &"	 "7!<!<"%::"W-	^ $B$E 9 T
"
 Rs   G	
G
G%G?Gc                D   g }d}g }g }| j                   D ]Q  }| j                  |      }|r)||k7  r$|j                  |       |j                  |       g }|}|j                  |       S |r"|j                  |       |j                  |       || j                  ||      fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  r  )r  r  r  r  )rV   r  r  cur_partitionr  rL   r  s          rX   graph_partitionzScheduler.graph_partition  s     +-
')JJ 	'D#44T:3C!C!!-0&&~6 "-N  &	' m,"">24==!? > 
 
 	
rZ   c                    t        d      5  t        j                  j                  j                  r| j                         n| j                  | j                        	 cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   r   r   r   r  _codegen_partitions_codegenr  r^   s    rX   r  zScheduler.codegen"  sX    -. 	 ??))99 ((*]]4::.	 	 	s   AA&&A/c                4   t         j                  j                  }t        | j                        }t         j                  j                         5  t         j                  j                  dd| ||       | j                  |       t         j                  j                  j                  t         j                  j                        \  }}ddd       t         j                  j                  j                  j                         t         j                  j                  j                  ||       t         j                  j                  j                  j                  |j                  D cg c]  }|j!                          c}       y# 1 sw Y   xY wc c}w )z,Codegen a partition given its inputs/outputsT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)rD   r|   r}   r  r  set_current_wrapper_codeinit_wrapper_coder  generateis_inferencedefine_subgraph_launcher_fnvaluecodegen_partition_call	allocatedr
  r  rU   )rV   r  	signaturer  graph_partition_idpartition_coder  rL   s           rX   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper*  s)     gg22!$"?"?@WW--/ 	TGG%%  *+=*>?$7%.	 &  MM)$ ! 4 4 = =agg>R>R SNA	T 	
889M9MN	334F	R	&&--)2)?)?@T]]_@	
	T 	T As   A<F	*F	Fc                p   | j                         \  }}t        ||      D ]V  \  }}t        |      dk\  sJ dt        |              |j                  r| j	                  |       E| j                  ||       X t        | j                        }t        j                  j                  j                  |       y)z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   z5Each partition must have at least one node but found N)r  r  rk   r  r  r  r  r  rD   r|   r}   set_all_partition_names)rV   r  r  r  r  num_partitionss         rX   r  zScheduler._codegen_partitionsD  s     "&!5!5!7
J$'
J$? 	F Iyy>Q& GIGWX& ''i(//	9E	F d;;<	44^DrZ   c                
   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        d | _        |D ]  }t        j!                  t"        j$                        r4	 t        j'                  d|j)                         |j+                                | j/                  |       |j1                         x}r|| j                  k7  s |j3                         s|j5                         r| j7                          || j                  k7  r| j                  rGt9        | j                  j:                        r(t<        j>                  j@                  jC                          || _        t9        |j:                        rF|jD                  J d       t<        j>                  j@                  jG                  |jD                         | jH                  jK                  |jL                         |j5                         rP|jO                  tQ        |jS                                     \  }	}
}| jU                  |      jW                  |
||	       n|j3                         r,tY        jZ                  t\        |      }| j_                  |       n|ja                         rqtY        jZ                  tb        |      }| jU                  |      }d	d
l2m3} d	dl4m5} tm        |||f      r|}nto        dt;        |             |jq                  |       nYtm        |tr        tt        f      r!| jU                  |      jw                  |       n"tm        |tx              sJ |j{                          t         j|                  j~                  r| jU                  |      j                          | j                  jK                  |j                                | j                  jK                  |j                                tm        |tx              r|j1                         }|| jU                  |      j                         s| j7                           | j                  rGt9        | j                  j:                        r(t<        j>                  j@                  jC                          | j7                          y # t,        $ r( t        j'                  d|j)                                Y ;w xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=)Fr   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r  r]   filename_dynamoconvert_frame__file__linenor!  r  r   r  r  r  r?  rU   r  r   r  r[  rk  ri  r  r9   re   rD   r|   r}   codegen_device_guard_exitr  codegen_device_guard_enterr  r
  r   r#  r   rC  ry  codegen_templater=  r>  r  r  rm  r+   codegen.cuda_combined_schedulingr  r  r  ry   r?  codegen_combo_kernelr  rM  codegen_noder  r  r  debug_sync_kernelcodegen_syncr'  rH  r  rD  ready_to_flush)rV   r  r   stackr?  framerG  rL   r`  r   r!  r"  backend_r  r  rc  s                   rX   r  zScheduler._codegenY  s4   44.++-E7A|D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  # G	!D.
IIO224 t$**v*d111~~''')JJLT000**/@++000 ,,FFH*0D'(5%||7V9VV7,,GGU%%,,T__=!484W4W)*51-   (99!8X !{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D#5}"EF  (55d;!$(>???}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*%$*:*:6*B*Q*Q*SJJLOG	!R #4T5H5H5M5M#N GG  ::<

M ! IIPs   3T??-U0/U0c                    |d   j                         }| t        j                  _        || _        |J | j                  |      }|j                  |      S )r^  r   )r[  rD   r|   rK   r  ry  benchmark_combo_kernel)rV   r  r`  rc  s       rX   r	  z Scheduler.benchmark_combo_kernel  sW     1((* $!!!""6*--i88rZ   c                   t         j                  sy|}|d   j                         }||j                  dk(  ryddlm} dg }}t        |      D ]  \  }}|j                         }	| j                  |	      rt        j                  d       	 | j                  |	      \  }
}t        j                  |
      rt        j                  d|        y		 ||
z  }|j                  |        	 | j                  |      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d
t        |      v rt        j                  d       Y d}~ y d}~ww xY w# |$ r-}d
t        |      v rt        j                  d       Y d}~y d}~ww xY w)r  Tr   Nr^  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r	  r[  re   r  r  r  rC  r  r>  r?  r_  r  r  r   r  r  r  r  r4   r5   )rV   r  subkernel_nodesr`  r  r  
path1_listr  r  r  msr  r  r  	ms2_clone_path2_listsmall_kernels                    rX   r  z!Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 >V[[E1;rZ!/2 	$HAu)I ##I.  R55i@D::b>$$U ! " 2ICd#7	$:
	*.*E*Eo*V'CK Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44M $ *c!f4$$]     	&#a&0  Y 	s<   ?F	&F? 	F<"F76F77F<?G1"G,+G,,G1c                p    | j                   |   }|j                  J |j                  j                         S rT   )r   rL   
get_layout)rV   rX  r   s      rX   get_buffer_layoutzScheduler.get_buffer_layout  s5    x(xx###xx""$$rZ   c                   | j                   D ]  }|j                         s|j                  j                  D ]  }t        j
                  j                  j                  |j                        }|s9t        |      dk(  sHt        |j                  t              rc|j                         g k(  swt        j
                  j                  j                  |j                           y r]  )r  r?   r   r   rD   r|   r"  r  r]   r-   ry   rg   r0   r   zero_dim_cpu_tensor_listr!  )rV   rL   r  r  s       rX   r  z$Scheduler.update_zero_dim_cpu_tensor  s    JJ 
	HD{{} ,,22 HDWW3377		BF+F3u< *6==:K L"OO-388<<TYYGH
	HrZ   )r  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r.  )r`  r/  r   r   r   )r  r   r   r   )rL   r$  r   rG   rs  )r  rG   r   r4  )r   rt  r  r  r   tuple[float, str]r  r  rf  r   r   r   )ri  r   r`  r'  r   r  )r  r  r   r   )r8  rG   r9  rG   r   zUnion[bool, Callable[[], bool]])rL   rG   r   rG   rT   )r  zOptional[int]r   r   r  )r  r4  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r8  rG   r9  rG   r   r   )r8  rG   r9  rG   r0  z"Union[tuple[str], OrderedSet[str]]r   r   r8  rG   r9  rG   r   r   r  )rS  rG   r!  rG   r@  r7  r   r   )rx  r)   r8  rG   r9  rG   r   r   )r  r&   r  r'   r   r   )r   r&   r   r   )r  r  r   r  )r  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r  r   r   )r`  r'  r   BaseScheduling)r`  r/  r   r  )rL   rG   r   r   )r]   r   r  r)  r   r   )r   z;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  zlist[PartitionType]r  z
list[bool]r   zlist[GraphPartitionSignature])r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  PartitionTyper  r.   r   r   r  r  r   z(tuple[float, float, list[Optional[str]]])r  r4  r   r   )rX  r   r   z	ir.Layout)Frf   r   r   r   r   r  r  propertyr  setterr  r  r  r  r  r  r  rI  rZ  r  r  r~  r  r_  re  rh  r  r  r  r{  rX  r  r:  r  r  r'  r+  r9  rE  r  rW  r:  rc  rm  rn  r>  r  r   r  r  r  r  r  r  ry  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  rh  ri  s   @rX   rJ   rJ     s   ))s
j	# & & ( (7#,"HOSb(#T,	 6S(4#&$66	808	8$T0TDHT	T
> 
>*6
>	
>L@\
~(&~(/@~(	(~(@>h,h	 hT..`?0 ,0 	:0 d,&,/@,	,\7&7/@7	7r$&$/@$	$6< < !< =	<
 
<|I6&I6/@I6	I6V
9(9 )9 	9
 
9vQMf3&3/@3	3j

(9
BS
	
J D J&J/@J	J<6 Q6	:6@4@4	4	8*0
'*%5$

+:
	
2	D ; -; @J; 	&; z
	B
<
 
 +
 
	
4E*dL949	19I5V%
HrZ   c                      e Zd Zd fdZddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ		 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 dd
ZddZddZddZddZ	 	 	 	 ddZddZ	 	 	 	 	 	 ddZ	 	 	 	 d dZ xZS )!r  c                0    t         |           || _        y rT   )r_  r   rK   )rV   rK   r  s     rX   r   zBaseScheduling.__init__,  s    "rZ   c                R    | j                   r| j                   j                          y y rT   )rK   r  r^   s    rX   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler0  s    >>NN'') rZ   c                    t               S )z0Return a set of .codegen.common.BackendFeature()r   r_  s     rX   get_backend_featuresz#BaseScheduling.get_backend_features4  s
    |rZ   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r<  s      rX   rc  z BaseScheduling.can_fuse_vertical8  
     "!rZ   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r<  s      rX   rd  z"BaseScheduling.can_fuse_horizontal@  r(  rZ   c                     y)au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr   r<  s      rX   r]  z.BaseScheduling.can_fuse_multi_outputs_templateH  s     rZ   c                    |j                         s|j                         rt        j                  ||      S t        j                  ||      S )z 
        Fuse two nodes
        )rm  r+  r  r  r<  s      rX   r  zBaseScheduling.fuseT  sA     !1!1!3-225%@@%**5%88rZ   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rV   r  s     rX   rz  zBaseScheduling.group_fn_  r(  rZ   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rV   r!  epilogue_nodesr\  s       rX   r  zBaseScheduling.codegen_templateg  s
     "!rZ   c                    t         zD
        Generate a kernel given a list of pre-fused nodes.
        r  )rV   r  rf  s      rX   re  z.BaseScheduling.generate_kernel_code_from_nodesu  r(  rZ   c                    t         r0  r  r  s     rX   r  zBaseScheduling.codegen_node}  
     "!rZ   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  r^   s    rX   r  zBaseScheduling.codegen_sync  r2  rZ   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   r^   s    rX   r  zBaseScheduling.ready_to_flush  s    
 rZ   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  r^   s    rX   r  zBaseScheduling.flush  r2  rZ   c                    t         )r^  r  r  s     rX   r_  z$BaseScheduling.benchmark_fused_nodes  
     "!rZ   c                    t         )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )rV   ri  s     rX   rh  z)BaseScheduling.benchmark_codegened_module  s
    
 "!rZ   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r<  s      rX   r  z'BaseScheduling.get_fusion_pair_priority  s     rZ   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  r  s     rX   r	  z%BaseScheduling.benchmark_combo_kernel  r7  rZ   )rK   zOptional[Scheduler]r   )r`  r'  r   zOrderedSet[BackendFeature]r  r$  )r  r  r   z"tuple[tuple[sympy.Expr, ...], ...])r!  rG   r.  r  r\  r  r   zOptional[str]r  )rL   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r  )ri  r   r   r  r  r  )rf   r   r   r   r$  r&  rc  rd  r]  r  rz  r  re  r  r  r  r  r_  rh  r  r	  rh  ri  s   @rX   r  r  +  sC   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
""0"DH"	"""""0"	""&/@	"4"	1"rZ   r  )rN  r   r   r   )rL   rG   r9  r,  r   zdict[str, SchedulerBuffer]r   r   )r  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )r  r;  rK   rJ   r  r4  r   r   )r   )r  zlist[list[int]]r  r  r  ztuple[int, ...]r   z	list[int])
__future__r   r  r   r  r  r  r  r  r  r  rL  rM  r  r=  r   r   r   r   r   r	   r
   r   r   collections.abcr   typesr   r  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   torch.utils._tritonr   r   r   r   r   r   r    analyze_preserves_zero_maskr!   codegen.commonr"   r#   r$   comm_analysisr%   r&   r'   r(   r)   excr*   r+   r,   r-   r.   r/   r0   	loop_bodyr1   r  r2   r3   runtime.runtime_utilsr4   r5   r  r6   utilsr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   virtualizedrD   	getLoggerrf   r   _logginggetArtifactLoggerr>  r  r   r  	dataclassrI   r   rG   r7  ri   r   r8  rO  rP  convolutionmmbmmaddmm
_scaled_mmr  r  r  rM  r  r  r  r+  r  r  r  r  r  rJ   r  r   rZ   rX   <module>rX     s   "         	     , R R R (    $ 6 ? M G / ? * 6 6 D M M ; : : 2    J 7 &     g!^^--hA
NN44XO () e. e. e.P 4_ 4 4w
1 w
1t
 
,  &K
&K4&K ,&K 
	&KV #()).."<"<**))..,,!IINN00!&!:!: W 1 W"5. 5~+% ~+B@	$@ $ 
	,P** P*fy:!3 y:x	?, ?J %'+#++ "+ 	+\ 
 
 
> +9??, l%H l%H^KK" K"rZ   