
    VhRC                       U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlZd dlmZ d d	lmZmZmZmZ d d
lmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; erd dl<m=Z= d dl>m?Z? da@daAdeBd<   ej                  j                  eEd      ZF ej                  eE      ZHd ZId ZJd*dZKd*dZLej                  dk(  ZN ej                  eE      ZH e9e,          ZOd*dZPd ZQ	  ej                  eQ        d+d!ZTe4 G d" d#             ZU G d$ d%      ZVej                  j                  d&d'      d(k(  s7ej                  j                  d)d(      d(k7  s e;       r ej                         rneVj                           ej                  eP       y# eS$ r Y w xY w),    )annotationsN)FutureProcessPoolExecutorThreadPoolExecutor)BrokenProcessPool)partial)timetime_ns)AnyCallableOptionalTYPE_CHECKING) get_registered_device_interfaces)countersdynamo_timedget_metrics_contextset_feature_use)config)
_load_triton_kernel_from_source	code_hashCodeCacheFutureCppCodeCacheCppPythonBindingsCodeCacheCUDACodeCacheHalideCodeCacheLambdaFutureROCmCodeCache	torch_key)AnyPoolSubprocPool)_async_compile_initializer)_set_triton_ptxas_path_worker_compile_triton)clear_on_fresh_inductor_cache)	_Faketqdmtqdm)
OrderedSet)has_triton_package)
HalideMeta)CachingAutotunerg        zOptional[float]_t0kernel_codec                 R    t                	 ddlm}   |         y# t        $ r Y yw xY w)zG
    Setup that must be done prior to forking with a process pool.
    r   
triton_keyN)caching_device_propertiestriton.compiler.compilerr/   ImportErrorr.   s    M/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/async_compile.pypre_fork_setupr4   >   s)     7 s    	&&c                 ~    t               D ]0  \  } }|j                         s|j                  j                          2 y N)r   is_availableWorkerget_device_properties)_device_interfaces     r3   r0   r0   Q   s8    ?A <((*##99;<    c                 &    t         t               a y y r6   )r+   r	    r<   r3   _compile_startr?   W   s    
{f r<   c                 J    t         t               } t        | t         z
  z  ad a y y r6   )r+   r	   _cumulative_compile_time)t1s    r3   _compile_endrC   ]   s'    
V BH,  r<   win32c                 N    t         D ]  } | j                           t                y)z/Shut down all outstanding compile-worker pools.N)	_pool_setshutdown
after_fork)pools    r3   shutdown_compile_workersrJ   n   s     Lr<   c                 h    t         j                          t        j                  j	                          y)z7Reset pools to initial state without shutting them downN)rF   clearAsyncCompileprocess_poolcache_clearr>   r<   r3   rH   rH   u   s    OO))+r<   )after_in_childc                 |    t         j                  t        j                         t         _        t         j                  S )z}
    Temporary for internal rollout. Assign config.compile_threads lazily and return it.
    TODO: remove after rollout.
    )r   compile_threadsdecide_compile_threadsr>   r<   r3   get_compile_threadsrT      s-    
 %!'!>!>!@!!!r<   c                  x    e Zd ZU dZi Zded<   ed
d       Zedd       Zedd       Z	ed        Z
edd       Zy	)CompiledTritonKernelsa/  
    In memory cache for storing compiled triton kernels.

    Each triton kernel is keyed by the hash of its source code. Each value stored
    in the cache is a return value of AsyncCompile.triton().

    Currently, the cache stores Future objects, but it should be generalizable for any kernels.
    zdict[str, LambdaFuture]_cachec                ,    t        | t                     S )a  
        Generates a cache key given a triton kernel's full source code.
        This source includes the inductor meta, compilation metadata, the kernel itself, etc.
        `kernel_src` should be the exact string passed to async_compile.triton()'s first argument.
        )extra)r   r   )
kernel_srcs    r3   keyzCompiledTritonKernels.key   s     9;77r<   c                T    t         j                  |       }|t         j                  |<   y)a  
        Saves a compiled triton kernel to the cache.
        TODO: We store a LambdaFuture as that's the callable returned by async_compile.triton,
        but the real type we want to return here is actually an abstract triton kernel.

        TODO: Source code here is not just the kernel's source code, but also includes the inductor preamble, etc.
        so it could be less strict.
        NrV   r[   rW   )rZ   futurer[   s      r3   savezCompiledTritonKernels.save   s%     $''
3,2$$S)r<   c                l    t         j                  |       }t         j                  j                  ||      S r6   )rV   r[   rW   get)rZ   defaultr[   s      r3   ra   zCompiledTritonKernels.get   s+    #''
3$++//W==r<   c                     i t         _        y r6   )rV   rW   r>   r<   r3   rO   z!CompiledTritonKernels.cache_clear   s    ')$r<   c                v    t         j                  |       }|t         j                  v rt         j                  |= y y r6   r]   )rZ   r[   s     r3   remove_futurez#CompiledTritonKernels.remove_future   s4    #''
3'...%,,S1 /r<   N)rZ   str)rZ   rf   r^   r   )rZ   rf   rb   r   returnr   )rZ   rf   rg   None)__name__
__module____qualname____doc__rW   __annotations__staticmethodr[   r_   ra   rO   re   r>   r<   r3   rV   rV      sv     ')F#(8 8 
3 
3 > > * * 2 2r<   rV   c                     e Zd ZddZe ej                  d      dd              Zed        Ze ej                  d      dd              Z	e
dd       Ze
dd       Zd Zddd	Zdd
ZddZddZddZ	 ddZddZddZddZy)rM   c                     y r6   r>   selfs    r3   __init__zAsyncCompile.__init__   s    r<      c                 F    t               dkD  sJ t        t                     S Nrt   )rT   r   r>   r<   r3   rI   zAsyncCompile.pool   s$     #$q(((!"5"788r<   c                      y)z>No-op function to help mark when the subprocess pool is ready.readyr>   r>   r<   r3   
_get_readyzAsyncCompile._get_ready   s     r<   c            	        t               dkD  sJ t        j                  dt        j                  t                      t        j                  dk(  rt        t                     } nt        j                  dk(  rdt        j                  d<   t                t        j                  t        j                        }t        t               |t        t        t        j                                     } t        j                  j!                  d | j"                  t$        j&                         | j)                  t*        j,                        | _        t0        j3                  |        | S )	Nrt   z"Creating '%s' pool with %d workers
subprocessspawn0TORCH_WARM_POOL)
mp_contextinitializer)exitpriority)rT   loginfor   worker_start_methodr    osenvironr4   multiprocessingget_contextr   r   r!   getpidutilFinalizerG   sysmaxsizesubmitrM   ry   ready_futurerF   add)rI   ctxs     r3   rN   zAsyncCompile.process_pool   s     #$q(((0&&!	
 %%5245D))W403

,-!--f.H.HIC&#%#$>		LD   ))$CKK)X !KK(?(?@dr<   c                h    t               dk  ry t                | j                          t                y rv   )rT   r?   rN   rC   )clss    r3   	warm_poolzAsyncCompile.warm_pool   s'     A%r<   c                h    t               dk  r |       S | j                         j                  |      S rv   )rT   rI   r   )r   tasks     r3   r   zAsyncCompile.submit   s+     A%6Mxxz  &&r<   c                p    t               dkD  xr( | j                         j                  j                         S rv   )rT   rN   r   donerq   s    r3   use_process_poolzAsyncCompile.use_process_pool  s/    !A%Q$*;*;*=*J*J*O*O*Q	
r<   c                *   t         j                  d      x}rt        d   dxx   dz  cc<   |S t        d   dxx   dz  cc<   t        j	                  d       t                t        j                  j                  dd      d	k(  r=t        t        j                  j                  j                  j                              S t        j                  t               | j#                         }t%        d
|       |rddg}|D ci c])  }|t        j                  v s|t        j                  |   + }}| j'                         j)                  t*        |      fddfd}	t-        |	      }t         j/                  |       |S t1        dddd      5  t3               }
t5                        }|j7                  d       t3               |
z
  dz  }t9               j;                  d|       |cddd       S c c}w # 1 sw Y   yxY w)a  
        Async_compile.triton is more complicated than the other backends because
        we're trying to optimize compile time as much as possible for this hot callsite.

        First of all, the function is cached by CompiledTritonKernels; if there's a kernel
        already compiled, we grab it directly from the cache and return.

        Otherwise, if we have multiple compile threads, we kick off triton compilations on each
        worker process by giving it a kernel and source code to compile. The worker initializes
        a CachingAutotuner, runs triton compilation, and pickles the kernel back to us.
        We use TritonCompileResult to represent the objects being pickled back to us by each
        worker.

        Some maybe not obvious things that are pickled back to us:
        - Most of the time, we can avoid sending back CachingAutotuner.fn and other metadata
          and do not have to pay the cost of loading the triton kernel on the parent. But certain
          cases, like coordesc tuning and dynamic_scale_rblock, require us to reload the function
          in the parent lazily when we require it.
        - The AutotuneCache, if enabled, is constructed on each worker per triton config
          and pickled by to us via `CachingAutotuner.save_cache_hook`.
        Ninductorasync_compile_cache_hitrt   async_compile_cache_misszTriton Kernel:
%sTRITON_INTERPRETr}   1parallel_compile_post_warmupTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRc                 T    t        d      5           cd d d        S # 1 sw Y   y xY w)Nreload_kernel_in_parent)r   )load_kernels   r3   r   z4AsyncCompile.triton.<locals>.reload_kernel_in_parent;  s&    !";< )&=) ) )s   'c                     j                         \  } }t        j                         | j                  d       t	               j                  d|       | S )NF)warm_cache_onlyreload_kerneltriton_kernel_compile_times_us)resultrV   re   
precompiler   	add_top_n)kernel
elapsed_uskernel_namer   source_coder   s     r3   
get_resultz'AsyncCompile.triton.<locals>.get_result@  s]    %)[[]"
 &33K@!!$)9P "  $%//4k: r<   )r^   zasync_compile.precompileTtriton_compile_time_uslog_pt2_compile_eventdynamo_compile_column_uslog_waitcounterF)r   i  r   )rg   ztuple[CachingAutotuner, int])rV   ra   r   kernel_code_logr   r?   r   r   getattrtorch	_inductor	codecachePyCodeCacheload	functoolsr   r   r   r   rN   r   r#   r   r_   r   r
   r"   r   r   r   )rr   r   r   
device_strr^   is_parallelenv_varsv	extra_envr   start_nsr   r   r   r   r   s    ``          @@@r3   tritonzAsyncCompile.triton  s   , +..{DAA6AZ !:;q@;M78A=81;?::>>,c2c9))55::;G   ''+[+
 ++-6D 23EFH3;OaqBJJBJJqM)OIO$$&--&D)
  "*T:F!&&{F;M*&*)A $	  #9&($!!%!8%i(2t;
#%//4k:  = P< s   	H!H AH		Hc                    ddl m}  ||i |S )Nr   )MultiKernelCall)$torch._inductor.codegen.multi_kernelr   )rr   argskwargsr   s       r3   multi_kernelzAsyncCompile.multi_kernela  s    H ///r<   c                    t         j                  d|       t               dk  rt        j                  |      j
                  S t        j                  || j                        t        fd      S )NzCPP Kernel:
%srt   	submit_fnc                 &             j                   S r6   )r   )r   s   r3   <lambda>z"AsyncCompile.cpp.<locals>.<lambda>m  s    
(;(; r<   )	r   r   rT   r   r   r   
load_asyncr   r   )rr   r   r   s     @r3   cppzAsyncCompile.cppg  sX    .< A%$$[1888%00TJ ;<<r<   c                    t         j                  d|       t               dk  rt        j                  ||      S t        j
                  ||| j                        }t        |      S )NzCPP+Bindings Kernel:
%srt   r   )r   r   rT   r   load_pybindingload_pybinding_asyncr   r   )rr   argtypesr   r   s       r3   cpp_pybindingzAsyncCompile.cpp_pybindingo  sY    7E A%-<<X{SS3HH+J  
++r<   c                d    t         j                  d       fd}| j                  |      S )NzCUDA Kernel:
%sc                 f     rt        j                  d       t        j                        d   S )Nor   )r   compiler   )aot_compiledst_file_extr   s   r3   r   zAsyncCompile.cuda.<locals>.task|  s0     %%k37 %%k<@CCr<   r   r   r   rr   r   r   r   r   s    ``` r3   cudazAsyncCompile.cuday  s+    /=	D {{4  r<   c                d    t         j                  d       fd}| j                  |      S )NzROCm Kernel:
%sc                     rt        j                  d      } t        j                  j                  rt        j                  d      } t        j
                        d   S )Nr   )r   exer   )r   r   r   rocmgenerate_test_runnerr   )r:   r   r   r   s    r3   r   zAsyncCompile.rocm.<locals>.task  sO    !))+CH{{//!))+EJ %%k<@CCr<   r   r   s    ``` r3   r   zAsyncCompile.rocm  s-     	/=	D {{4  r<   c                    t         j                  d||       t               dk  rt        j                  ||      S t        j
                  ||| j                        }t        |      S )NzHalide Kernel:
%r
%srt   r   )r   r   rT   r   generate_halidegenerate_halide_asyncr   r   )rr   metar   r   s       r3   halidezAsyncCompile.halide  s[    5t[I A%"224EE(>>kT[[J  
++r<   c                    t               dkD  r4t        dddd      5  | j                  |       d d d        t                y t                y # 1 sw Y   t                y xY w)Nrt   zasync_compile.waitTr   r   )rT   r   _wait_futuresrC   )rr   scopes     r3   waitzAsyncCompile.wait  sS     1$$&*)A $	 * ""5)* 	* 	s   AA c           	        |j                         D ci c]  \  }}t        |t        t        f      r||  }}}t	        t        |      dt        j                  d      }|j                         D ][  \  }}t        j                  r!t        |t              s|j                  |       	 |j                         ||<   |j                  d       ] y c c}}w # t        $ r}t        d      |d }~ww xY w)NzInductor Compilationr   )totaldescdisabledelayzA compilation subprocess exited unexpectedly. This is likely due to a crash. To facilitate debugging, you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 to cause compilation to occur in the main process.rt   )items
isinstancer   r   r&   lenr   disable_progressverbose_progressr%   set_postfix_strr   r   RuntimeErrorupdate)rr   r   r[   valuekernelspbarr   es           r3   r   zAsyncCompile._wait_futures  s     $kkm
U%&/!:; J
 

 g,'++	
 #==? 	KC&&z$	/J$$S)#]]_c
 KKN	
" % "I
 s   #C(C	C/C**C/Nrg   rh   )rg   r   )rg   r   )r   zCallable[..., Any]rg   r   )r   )r   rf   r   rf   r   rf   )rg   r   )r   rf   )r   z	list[str]r   rf   )F)r   r)   r   rf   )r   zdict[str, Any]rg   rh   )ri   rj   rk   rs   rn   r   	lru_cacherI   ry   rN   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r>   r<   r3   rM   rM      s     Y9  9   Y    D   ' '


Yv0=,
!  	!",
r<   rM   TORCH_TNT_IN_USEr}   r   r~   r   )rg   int)\
__future__r   atexitr   loggingr   r   r   concurrent.futuresr   r   r   concurrent.futures.processr   r   r	   r
   typingr   r   r   r   r   torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   r   r   torch._inductorr   torch._inductor.codecacher   r   r   r   r   r   r   r   r   r   +torch._inductor.compile_worker.subproc_poolr   r    'torch._inductor.compile_worker.watchdogr!   %torch._inductor.runtime.compile_tasksr"   r#   torch._inductor.utilsr$   	torch.hubr%   r&   torch.utils._ordered_setr'   torch.utils._tritonr(   torch._inductor.runtime.hintsr)   )torch._inductor.runtime.triton_heuristicsr*   rA   r+   rm   _logginggetArtifactLoggerri   r   	getLoggerr   r4   r0   r?   rC   platform_IS_WINDOWSrF   rJ   rH   register_at_forkAttributeErrorrT   rV   rM   r   ra   	is_fbcoder   registerr>   r<   r3   <module>r!     s   "     	 
 N N 8   9 9  K  #   M N @ % / 2 8J  _ ..228]Kg!&< llg%g!  Jw!	,	Bz2
" 02 02 02fF FT JJNN%s+s2	zz~~'-4 v ( ){
  		s   4G G&%G&