
    Vhh<                        d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZmZ dd	lmZ d
dlmZmZ  ej2                  e      Zd ZddZd Zd Zd Z  G d d      Z! G d d      Z" G d d      Z#y)    N)get_metric_tableis_metric_table_enabled)
OrderedSet   )config)	code_hashCodeCacheFutureget_path)benchmarker)cache_on_selfIndentedBuffer)V   )	TensorArgWorkspaceArgc                 ~    | j                   j                         \  }}}}|D cg c]  }|j                   c}S c c}w N)argspython_argdefsname)kernelarg_defs_xs       T/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codegen/multi_kernel.pyget_kernel_argdefsr      s5    224HaA$%qAFF%%%s   :c                     t        | t              d d  }|t        |t              d d  nd }| D ]1  }t        |      j                  t        |            r'J | d|         ||fS )N)keyz v.s. )maxlenr   issubset)	args_listarg_types_listall_args	arg_typesr   s        r   _get_all_argsr&      s{    9#&q)H3A3MN,Q/SWI 
$((H)=> 	
fF8*%	
>

 Y    c                 X    | D cg c]  }t        |       }}t        |      d   S c c}w )zb
    The logic here must match with `get_all_call_args`, except no need to get arg_types here
    r   )r   r&   )kernelsr   argdefs_lists      r   get_all_kernel_argdefsr+   %   s4     >EE6&v.ELE&q)) Fs   'c                     t        | |      S )a  
    Passed in the call_args for each subkernel and return the call_args for the
    combined multi-kernel.

    Note an algorithm as follows does not always work:
    ```
        all_call_args: Dict[
            Any, None
        ] = {}  # use a dict rather than set to maintain insertion order
        for call_args in call_args_list:
            all_call_args.update({arg: None for arg in call_args})

        all_call_args = list(all_call_args.keys())
    ```
    It will fail if any kernel has the same argument passed in multiple times.
    Check test_pass_same_arg_multi_times in test_multi_kernel.py

    Instead, we pick the longest call args and assert that other call args are
    a subset of it.
    )r&   )call_args_listr#   s     r   get_all_call_argsr.   .   s    * 88r'   c                     | j                   D cg c])  }|j                  r| j                  r|j                   d+ }}|S c c}w )Nnumel)range_treesis_reductioninside_reductionprefix)r   treenumel_argdefss      r   get_numel_argdefsr7   F   sP     &&  F$;$; ;;-uM  s   .Ac                       e Zd ZdZd Zd Zy)MultiKernelStatez
    Maintain state of multi-kernel compilation so we don't define duplicated
    multi-kernel for the same set of sub-kernels.

    V.graph.wrapper_code has a reference to MultiKernelState instance.
    c                     i | _         y r   )subkernel_to_kernel_name)selfs    r   __init__zMultiKernelState.__init__X   s
    (*%r'   c                 :   t        d |D              }|| j                  v r| j                  |   S dt        | j                         }|| j                  |<   t        j                  j
                  rt        j                  j                  s|S t               }|j                  d       |j                  | d|d       |j                         5  |D ]  }|j                  | d        	 ddd       |j                  d       t        j                  j                  }t        j                  j                  r;|j                  j                  |       ||j                  d	j!                  |      <   |S |j"                  j                  |       |S # 1 sw Y   xY w)
aK  
        Previously we name the multi kernel as "multi_kernel_{kernel_names[0]}".
        This has some minor issue.

        E.g. for persistent reduction https://gist.github.com/shunting314/39e7c00ff8bb2055942ed5a3255d61ca ,
        there are 2 flavors of non-persistent reduction:
          https://gist.github.com/shunting314/056d43d35907e87efb883970b35c17d4
        and
          https://gist.github.com/shunting314/02ee753b65c513c54e695626afe682bd

        The only different is cache eviction policy.

        We should name the multi-kernel differently in these 2 cases.
        c              3   4   K   | ]  }|j                     y wr   )kernel_name).0ks     r   	<genexpr>z1MultiKernelState.define_kernel.<locals>.<genexpr>j   s     <qQ]]<s   multi_kernel_ z = async_compile.multi_kernel(z, [,Nz])
)tupler;   r    r   graphcpp_wrapperr   tritonautotune_at_compile_timer   	writelineindentwrapper_codekernel_autotune_defssplicesrc_to_kerneljoinheader)r<   r)   kernel_namesmulti_kernel_namebufr   wrappers          r   define_kernelzMultiKernelState.define_kernel[   sn    <G<<488800>> ,C0M0M,N+OP6G%%l377v}}'M'M %$b !!?@Q?TTWX	
 ZZ\ 	*$ *aj)*	* 	d''&&==11((//4=NG!!$))L"9: !  NN!!#&  	* 	*s   FFN)__name__
__module____qualname____doc__r=   rY    r'   r   r9   r9   P   s    +-!r'   r9   c                       e Zd ZdZd Zedee   dee   fd       Zed        Z	d Z
d Zed	        Zed
        Zeed               ZdefdZy)MultiKernela  
    This class maintains the compile time state for multi kernels.

    Assume we do codegen for a MultiKernel encapsulating kernel1 and kernel2.
    The generated definition for the multi-kernel will looks like:
    ```
    multi_kernel_kernel1 = MultiKernelCall(
        [kernel1, kernel2], multi_kernel_definition_code
    )
    ```

    Here is an concrete example: https://gist.github.com/shunting314/d9f3fb6bc6cee3dbae005825ca196d39
    c                     t        |      dk\  sJ || _        t        j                  j                  j
                  j                  |      | _        t               | _	        y Nr   )
r    r)   r   rI   rO   multi_kernel_staterY   r@   objectr   )r<   r)   s     r   r=   zMultiKernel.__init__   sL    7|q   77//BBPP
 H	r'   leftrightc                    | |k(  r| S | D ci c]  }|j                   | }}|D ]P  }|j                   |v r1t        j                  ||j                      |      ||j                   <   B|||j                   <   R g |j                         S c c}w r   )
inner_namer   maximumvalues)re   rf   r   resultargs        r   _merge_workspace_argsz!MultiKernel._merge_workspace_args   s    5=K+/0a!,,/00 	-C~~')5)=)=3>>*C*s~~& *-s~~&	- "!! 1s   B	c                     t        |       dk  ry t        j                  t        j                  | D cg c]  }|j
                  j                   c}      }| D ]  }||j
                  _         |S c c}w rb   )r    	functoolsreducer`   rm   r   workspace_args)r)   r   rq   s      r   merge_workspaces_inplacez$MultiKernel.merge_workspaces_inplace   sj    w<!"))--6=>FV[[''>
  	8F)7FKK&	8	 ?s   A0
c                    || j                   k(  sJ t        j                  j                  j	                          | j
                  d   j                  j                         \  }}}}| j
                  dd D ]6  }|j                  j                         \  }}}}||k(  s	J ||f       ||k(  r6J  t        j                  j                  r9t        j                  j                  st        j                  | j                         }| j
                  d   j                  |||       | j
                  d   j                  j                  D ]+  }t        j                  j                  j!                  |       - t        j                  j                  j#                  |||       t%        | j
                  d   j                  j                        D ]+  }t        j                  j                  j'                  |       - y)zs
        Collect the union of arguments from all subkernels as the arguments
        for the multi-kernel.
        r   r   N)r%   )r@   r   rI   rO   write_triton_header_oncer)   r   r   rJ   r   rK   rL   MultiKernelCalllookup_choiceadd_numel_to_call_argsrq   generate_workspace_allocationgenerate_kernel_callreversedgenerate_workspace_deallocation)	r<   r@   r   	call_argsr%   r   other_call_argsother_arg_typeswss	            r   call_kernelzMultiKernel.call_kernel   s   
 d.....	557%)\\!_%9%9%H%H%J"9all12& 	0F5;[[5O5O5Q2A?/M)_1MM////	0
 77v}}'M'M *778H8HIK 	Q..{IyQ,,q/&&55 	CBGG  >>rB	C 	
11 	2 	
 4<<?//>>? 	EBGG  @@D	Er'   c                    t         j                  j                  }t        t                  }| j
                  D ]  }|j                  j                         \  }}}}t        ||      D ]Z  \  }}||v r|j                  |       t        |t              s-d| d}	|j                  |	       d| d}	|j                  |	       \  y )Nzassert not z.isnan().any().item()z.isinf().any().item())r   rI   rO   r   strr)   r   r   zipadd
isinstancer   rM   )
r<   rX   seenrB   r   r|   precompile_argsrl   precompile_arglines
             r   codegen_nan_checkzMultiKernel.codegen_nan_check   s    ''&&#  
	,A/0vv/D/D/F,Ay/1'*9o'F ,#^$;ni8(-BCD%%d+(-BCD%%d+,
	,r'   c                 t    t        j                  | j                  D cg c]  }|j                   c} S c c}w r   )r   intersectionr)   removed_buffersr<   rB   s     r   r   zMultiKernel.removed_buffers   s+    &&DLL(Qq):):(QRR(Q   5c                 t    t        j                  | j                  D cg c]  }|j                   c} S c c}w r   )r   r   r)   inplaced_to_remover   s     r   r   zMultiKernel.inplaced_to_remove   s+    &&t||(T!)=)=(TUU(Tr   c                     | j                   dd D ]*  }|j                  | j                   d   j                  k(  r*J  | j                   d   j                  S )zN
        Make sure all kernels have the same inplace update mappings.
        r   Nr   )r)   inplace_update_buffersr   s     r   r   z"MultiKernel.inplace_update_buffers   sV     ab! 	VA++t||A/U/UUUU	V||A555r'   r@   c                      y r   r^   )r<   r@   s     r   warn_mix_layoutzMultiKernel.warn_mix_layout  s    r'   N)rZ   r[   r\   r]   r=   staticmethodlistr   rm   rr   r   r   propertyr   r   r   r   r   r   r^   r'   r   r`   r`      s    
 "D$6 "tL?Q " " 
 
EB, S S V V 6  63 r'   r`   c                   ~    e Zd ZdZd Zd Zd Zd Zed        Z	d Z
eded	efd
       Zededefd       Zd Zd Zy)ru   zE
    This class is called at run time to actually run the kernel
    c                    t        |      dk\  sJ || _        || _        t        j                  j                  d      dk(  xs t        d      | _        d | _        t        j                  j                  dkD  rFt        j                  j                  dz
  }|t        | j                        k  sJ || _        d| _        y | j                  s| j                          d| _        y )Nr   (TORCHINDUCTOR_DISABLE_MULTI_KERNEL_CACHE1persistent_red_perfr   F)r    _kernelsrV   osenvirongetr   disable_cachepicked_kernelr   rK   multi_kernel
load_cache	_recorded)r<   rV   r)   picked_by_configs       r   r=   zMultiKernelCall.__init__  s    7|q   !2ZZ^^6
 C+,AB 	 "==%%)%}}99A=#c$--&8888!1D  ##OOr'   c           
         t        dj                  | j                  D cg c]2  }|j                  j                   |j
                  |j                  4 c}            }t        |d      \  }}}t        j                  |      S c c}w )NrF   r   )
r   rS   r)   fn	cache_key
size_hintstriton_metar
   pathlibPath)r<   rB   r   r   paths        r   cache_file_pathzMultiKernelCall.cache_file_path   s|    HH "\\ tt~~&q||&6q}}6GH
 c?3
1d||D!!s   7Bc                    | j                   J | j                         }|j                         r|j                         5 }t	        |j                               | _         | j                   dk\  r"| j                   t        | j                        k  sJ t        j                  d| j                   |       d d d        y y # 1 sw Y   y xY w)Nr   z(Load picked kernel %d from cache file %s)
r   r   existsopenintreadr    r   logdebugr<   r   fds      r   r   zMultiKernelCall.load_cache,  s    !!)))##%;;= %(^"))Q.43E3EMMI 4   		>@R@RTX   s   A3B<<Cc                 P   | j                   J | j                         }|j                  j                  dd       |j	                  d      5 }|j                  t        | j                                d d d        t        j                  d| j                   |       y # 1 sw Y   +xY w)NT)parentsexist_okwz'Store picked kernel %d to cache file %s)	r   r   parentmkdirr   writer   r   r   r   s      r   store_cachezMultiKernelCall.store_cache9  s    !!---##%$6YYs^ 	.rHHS++,-	.		;T=O=OQUV	. 	.s   %BB%c                     t        | j                        D ]3  \  }}t        |t              s|j	                         | j                  |<   5 | j                  S )z
        Read results from future.

        This should be called after parallel compilation is done.
        In case you call this before compilation is done,
        it may slow down the parallel compilation.
        )	enumerater   r   r	   rk   )r<   ir   s      r   r)   zMultiKernelCall.kernelsB  sJ     #4==1 	3IAv&/2#)==?a 	3 }}r'   c                     fd}| j                   D cg c]  }t        j                   ||      d      ! c}S c c}w )z
        Benchmark all the sub kernels and return the execution time
        (in milliseconds) for each of time.

        Unit test may mock this method to force a specific kernel to
        be picked.
        c                       fd}|S )Nc                  R     j                   i \  } } j                  | i |S r   )
clone_argsrun)
args_clonekwargs_cloner   r   kwargss     r   innerzEMultiKernelCall.benchmark_sub_kernels.<locals>.wrap_fn.<locals>.inner[  s5    +<6+<+<d+Mf+M(
L!vzz:>>>r'   r^   )r   r   r   r   s   ` r   wrap_fnz6MultiKernelCall.benchmark_sub_kernels.<locals>.wrap_fnZ  s    ? Lr'   (   )rep)r)   r   benchmark_gpu)r<   r   r   r   r   s    ``  r   benchmark_sub_kernelsz%MultiKernelCall.benchmark_sub_kernelsQ  s?    	 ,,
 %%gfo2>
 	
 
s   $>rV   picked_kernel_namec                     ddl m} t        t        j                  |      syt        j                  j
                  sy|t        j                  j                  | <   y)z
        Record the multi-kernel choice for cpp-wrapper after autotuning

        We should do nothing if this function is not called during codegen.
        r   )GraphLoweringN)torch._inductor.graphr   r   r   rI   record_multi_kernel_choicemulti_kernel_to_choice)rV   r   r   s      r   record_choicezMultiKernelCall.record_choicen  s=     	8!''=1ww11<N&&'89r'   returnc                     t         j                  j                  r| t         j                  j                  v sJ t         j                  j                  |    S r   )r   rI   r   r   )rV   s    r   rv   zMultiKernelCall.lookup_choice  sF     GG..!QWW%C%CC	
D ww--.?@@r'   c           
      ~   | j                    | j                  |i |}|j                  t        |            | _         | j                  d   }t
        j                  d| j                   | j                  D cg c]  }|j                  j                  d       c}|j                  |j                  j                  d      |       t        d      j                  t        j                  | j                  |             | j                  s| j!                          | j"                  sYd| _        | j                  | j                      j                  j                  d      }|J | j%                  | j&                  |       | j                  | j                      j(                  | _         | j(                  |i | y c c}w )Nr   zHpick %dth sub-kernel in %s. Size hints %s. Reduction hint %s. Timings %sr@   reduction_hintr   T)r   r   indexminr)   r   r   inductor_metar   r   r   add_rowro   partial_metrics_table_rowr   r   r   r   rV   r   )r<   r   r   timingsk0rB   r   s          r   r   zMultiKernelCall.run  ss   %0d00$A&AG!(s7|!<DaBIIZ""=A\\J$$]3J  $$%56 23;;!!$"9"97C %%  "~~!DN!%d.@.@!A!O!O!S!S" &111t557IJ<< 2 2377$!&!' Ks   7"F:c                 d   d }| j                   d   }|j                  |j                  j                  d      d}d}t	        |      |k  sJ t        |      D ]U  }|t	        | j                         k  r) || j                   |         |d| d<   ||   |d| d<   Dd	|d| d<   d	|d| d<   W |S )
Nc                 V    | j                   j                   j                  j                  S r   )r   __code__co_filename)rB   s    r   get_kernel_pathz;MultiKernelCall._metrics_table_row.<locals>.get_kernel_path  s    4477##///r'   r   r   )r   r      r   _path_latencyrE   )r)   r   r   r   r    range)r<   r   r   r   rowmax_kernelsr   s          r   r   z"MultiKernelCall._metrics_table_row  s    	0 \\!_-- ..223CD
 7|{***{# 	/A3t||$$)8a)IfQCu%&,3AJfQCx())+fQCu%&,.fQCx()	/ 
r'   N)rZ   r[   r\   r]   r=   r   r   r   r   r)   r   r   r   r   rv   r   r   r^   r'   r   ru   ru     s    (
"W  
: O O# O O  A A A A":r'   ru   r   )$ro   loggingr   r   torch._inductor.metricsr   r   torch.utils._ordered_setr   rE   r   	codecacher   r	   r
   runtime.benchmarkingr   utilsr   r   virtualizedr   commonr   r   	getLoggerrZ   r   r   r&   r+   r.   r7   r9   r`   ru   r^   r'   r   <module>r      s}      	  M /  < < . 1  + g!&
*908! 8!vy yxq qr'   