
    Vh4                        d dl mZ d dlZd dlmZmZ d dlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ dd	lmZmZmZ dd
lmZ erd dlZd dlmZ ddlmZ ddlmZ  G d dej:                        Z G d d      Zy)    )annotationsN)AnyTYPE_CHECKING   )config)
write_text)get_metric_tableis_metric_table_enabled)DevicePropertiesReductionHint)BaseSchedulerNode	Scheduler	WhyNoFuse)V)
OrderedSet)SIMDKernelFeatures)TritonKernelc                      e Zd ZdZddZy)Sortablez>Anything that can be used as a list.sort() key (int/tuple/etc)c                     y N )selfothers     G/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/choices.py__lt__zSortable.__lt__   s        N)r   ztyping.Selfreturnbool)__name__
__module____qualname____doc__r   r   r   r   r   r      s    H5r   r   c                  (   e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 	 	 	 	 	 	 dd       Z	e	 	 	 	 	 	 	 	 	 	 dd       Z
e	 	 	 	 	 	 	 	 	 	 dd       Ze	 	 	 	 	 	 	 	 	 	 dd	       Ze	 	 	 	 	 	 	 	 dd
       Zy)InductorChoicesax  
    This class contains a collection of default heuristics that effect performance of our generated
    code.  We try to not put correctness requirements in this file.

    You can override the choices made here by doing:

            class MyHeuristics(InductorChoices):
                ...

            torch._inductor.virtualized.V.set_choices_handler(MyHeuristics())
    c                    |S )zTHook to change the kwargs passed to TritonKernel, used to apply fixed configurationsr   )r   
kernel_clsfeaturesgroupskernel_kwargss        r   triton_kernel_kwargsz$InductorChoices.triton_kernel_kwargs+   s
     r   c                   t         j                  j                  ryt         j                  j                  r+t        j
                  j                         j                  dk(  ryt        j
                  j                  j                  | j                  d      }|dk  rd|z  }n	|dk  rd	}nyt        j
                  j                  j                  | j                  |      S )
z>Heuristic to decide if a cooperative reduction should be used.TcpuF   )fallback   i      i    )r   tritonforce_cooperative_reductionscooperative_reductionsr   graphget_current_device_or_throwtypesizevars	size_hintnumelstatically_known_geqreduction_numel)r(   xhint	thresholds      r    should_use_cooperative_reductionz0InductorChoices.should_use_cooperative_reduction5   s     ==5544ww22499UB  **8>>A*FA:Ib[Iww44$$i
 	
r   c                   t         j                  j                  syt        j                  dij                  | j                         d      }|rD	 |dt        t        j                  j                  j                  | j                        d      z  z  }t         j                  j                  r|dz  }t        j                  j                  j                  | j                   |      S # t        $ r Y ^w xY w)zO
        Heuristic to decide if a persistent reduction should be used.
        Fi   @       r1   )r   r2   persistent_reductionsr   INNERgetget_reduction_hintminr   r5   r8   r9   r:   
ValueErrormulti_kernelstatically_known_leqr<   )r(   cooperative_reductionr>   s      r   should_use_persistent_reductionz/InductorChoices.should_use_persistent_reductionL   s     }}22

#h))+R
0 	 !R3qww'7'7'A'A(..'QSU#VVV	 ==%%OIww44$$i
 	
  s   AC% %	C10C1c                    | j                         t        j                  k(  xr4 t        j                  j
                  j                  | j                  d      S )a  
        Heuristic to decide if we should drop the X dimension from a persistent reduction kernel.
        So the [XBLOCK, RBLOCK] block becomes a [RBLOCK] block and XBLOCK is forced to be always 1.
        Strangely this is faster than a [1, RBLOCK] block in some cases.
           )rF   r   rD   r   r5   r8   r;   r<   )r(   s    r   want_no_x_dimzInductorChoices.want_no_x_dimj   sG     '')]-@-@@ U  55h6N6NPST	
r   c                   t        j                  |       }|j                  }d}dd}||z  |z  }|z  |z  }	d}
d|
z  }|r|d|z  k\  ry|dk  ry||z  |k  r|}n||z  |	k  rm||z  d|z  z  }||z   dz
  |z  }|||z  z   dz
  ||z  z  t        j                  |      }t        |fd	      }t        |z
        d
k  rt        ||      }n>}n;t        j                  |      }t        |fd	      }t        |z
        dk  r|}n}|||z  z   dz
  ||z  z  S d}d}||z   dz
  |z  }||z  |k  r|}n||z  |	k  rj||z  |z  }||z   dz
  |z  }|||z  z   dz
  ||z  z  t        j                  |      }t        |fd	      }t        |z
        dk  rt        ||      }n>}n;t        j                  |      }t        |fd	      }t        |z
        dk  r|}n}|||z  z   dz
  ||z  z  S )zHeuristic to decide the RSPLIT used for split reductions.
        When a reduction has a small number of outputs there is not enough parallelism,
        so we will do the reduction in two phases.rB   i   i   r0   r.   r   i    c                     t        | z
        S r   absxtmp_split_sizes    r   <lambda>z8InductorChoices.reduction_split_factor.<locals>.<lambda>       c!n:L6M r   )key   c                     t        | z
        S r   rR   rU   max_elements_per_threads    r   rW   z8InductorChoices.reduction_split_factor.<locals>.<lambda>       c!>U:U6V r   2         c                     t        | z
        S r   rR   rT   s    r   rW   z8InductorChoices.reduction_split_factor.<locals>.<lambda>   rX   r      c                     t        | z
        S r   rR   r\   s    r   rW   z8InductorChoices.reduction_split_factor.<locals>.<lambda>   r^   r   )r   createmulti_processor_countsympydivisorsrG   rS   max)devicereduction_numel_hint
numel_hintinner_reductionpropsnum_smmin_elements_per_threadthreads_per_smmin_elements_per_devicemax_elements_per_device	num_warpsnum_threads
split_sizetarget_blocksblocks_per_outputrh   closestrvals_per_threadxvals_per_blockxblocksr]   rV   s                       @@r   reduction_split_factorz&InductorChoices.reduction_split_factorv   s    !''/,,"$"%"9F"B^"S"9F"B^"S	9n QZ'#t+#j04KK4
%
25LL & 7AO L%2Z%?!%C
$R!(;9J+JJQN!$55"7 !>>*>?h,MNw/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(:+CCaG[(   !!O!O3a7OKG#j03JJ4
%
25LL & 7K H!.!81!< H(+;m+KKaO&6"8 !>>*>?h,MN~/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(+;j+HH1L :- r   c                    |dk(  rt         j                  r j                         sj                         rt        d      rvj                  j                         j                  j                         z  t              dkD  r3t        d      j                   fd        t              d       y t              d       yj                         sgj                         sWt        j                               t        j                               z   t         j                  kD  r t              d       y j                        r t              d       yy	)
a  
        Heuristics to prevent fusion applied to both horizontal and vertical fusions.  Heuristics here should not
        be needed for correctness and tweaking them may yield additional performance.

        See also some related heuristics that can be changed via config:
            - config.triton.tiling_prevents_pointwise_fusion
            - config.triton.tiling_prevents_reduction_fusion
            - config.aggressive_fusion (will cause this function to be called more times)
        r   'fusion_failure_due_to_indexing_mismatchc                 B   t         j                  j                  t         j                  j                  j	                         j	                         t        j                               t        j                               t               j                         dS )N)pre_grad_graph_idpost_grad_graph_id
node1_name
node2_namenode1_debug_strnode2_debug_strcommon_buffer_namesfailure_reason)	r   r5   graph_idr   get_namer   	debug_strlistdecide_fusion_fail_reason)common_buf_namesnode1node2	schedulers   r   rW   z*InductorChoices.can_fuse.<locals>.<lambda>   su    121A1A23''2L2L*/..*:*/..*:/9%//:K/L/9%//:K/L378H3I.7.Q.Q %u.>/! r   z'no shared data due to indexing mismatchFzno shared datazexceeds max fusionz Fusion will increase peak memoryT)r   aggressive_fusionis_reductionr
   read_writesbuffer_nameslenr	   add_rowr   
is_foreach	get_nodesmax_fusion_sizecan_fusion_increase_peak_memory)r   r   r   shared_data_scorer   s   ``` @r   can_fusezInductorChoices.can_fuse   s7     !((E,>,>,@EDVDVDX&'PQ%%224u7H7H7U7U7WW ! '(1,$%NOWW ,IeU+,UV #IeU#$45   "$$&EOO%&U__->)??&BXBXX#IeU#$8944UEB#IeU#$FGr   c                     y)zCHook for heuristics to prevent vertical (producer/consumer) fusionsTr   r   r   r   r   s       r   can_fuse_verticalz!InductorChoices.can_fuse_vertical  s     r   c                    |t         j                  k  r t        ||      d       y| j                  ||      r t        ||      d       yy)zEHook for heuristics to prevent horizontal (consumer/consumer) fusionsscore_fusion_memory_thresholdFz=Nodes are too far away. Fusing them may increase peak memory.T)r   r   r   are_long_distant_nodesr   s       r   can_fuse_horizontalz#InductorChoices.can_fuse_horizontal  sS     vCCC#IeU#$CD++E59#IeU#O r   c                   | j                  ||      }t        t        |j                  |j                  z
        t        |j                  |j                  z
               }|j                         rd}n+d|j                         t        j                  k(  xr |dkD  z   }||j                         |j                         k(  xr |dkD  ||fS )a  
        Assign a score (higher comes first) to the fusion of node1 and node2.
        When different fusions conflict with each other, this is the way we
        decide what order to run them in.

        Our current score is based on:
        - The type of fusion (template/reduction/etc)
        - Estimate of the saved memory operations
        - Fusions closer together in original graph order
        r   r   )	score_fusion_memoryri   rS   	min_order	max_orderis_templater   epilogue_fusion_firstr   )r   r   r   memory_scoreproximity_scoretemplate_scores         r   score_fusionzInductorChoices.score_fusion"  s      !44UEB%//12%//12
 
 N""$(D(DD % 1$N  E$6$6$88M\A=M	
 	
r   N)
r'   ztype[TritonKernel]r(   r   r)   zlist[sympy.Expr]r*   dict[str, Any]r   r   )r(   r   r   r   )r(   r   rK   r   r   r   )
rj   ztorch.devicerk   intrl   r   rm   r   r   r   )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r    r!   r"   r#   r+   staticmethodr?   rL   rO   r}   r   r   r   r   r   r   r   r%   r%      s   
& % !	
 & 
 
 
, 
$
=A
	
 
: 	
 	
 SS!S S 	S
 
S Sj 77 7 !7 	7
 
7 7r   ! 	
 
    ! 	
 
 " #
#
 #
 !#
 
	#
 #
r   r%   ) 
__future__r   typingr   r   rg    r   	codecacher   metricsr	   r
   runtime.hintsr   r   r   r   r   r   virtualizedr   torchtorch.utils._ordered_setr   codegen.simd_kernel_featuresr   codegen.tritonr   Protocolr   r%   r   r   r   <module>r      sS    "  %   ! > : > >  3@,6v 6h
 h
r   