
    Vh                        d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ erd dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl8m<Z< ddl=m>Z> ddl?m@Z@ dZAdaB e-eCd      ZD ej                  eC      ZF G d d      ZG G d d      ZH G d  d!eI      ZJej                  d<d"       ZLej                   G d# d$             ZNej                   G d% d&             ZO eO       ZPee"j                  e"j                  f   ZSej                   G d' d(             ZTej                   G d) d*             ZU G d+ d,eU      ZV G d- d.      ZW G d/ d0      ZX G d1 d2eU      ZY G d3 d4eWeY      ZZ G d5 d6eXeY      Z[ G d7 d8eWeU      Z\ G d9 d:eXeU      Z]	 	 	 	 d=d;Z^y)>    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableOptionalTYPE_CHECKINGUnion)multiprocessing)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeis_gpu)getArtifactLogger)
OrderedSet)BaseProcess)Queue)
ModuleType)TritonTemplateCaller   )WorkspaceArg)config)WorkspaceZeroMode)benchmarker)VCUDA_VISIBLE_DEVICESF
autotuningc                      e Zd Zy)PingN__name__
__module____qualname__     P/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr*   r*   ;       r0   r*   c                      e Zd Zy)PongNr+   r/   r0   r1   r4   r4   ?   r2   r0   r4   c                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorNr+   r/   r0   r1   r6   r6   C   r2   r0   r6   c              #  p  K   | d yt         j                  j                  t              }t	        |       t         j                  t        <   	 d |t         j                  t        = y|t         j                  t        <   y# |t         j                  t        = w |t         j                  t        <   w xY ww)z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr'   str)devicecurrents     r1   set_cuda_visible_devicer>   G   s      ~jjnn12G'*6{BJJ#$7?

/0/6BJJ+, ?

/0/6BJJ+,s   AB6B 0B61B33B6c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   e		 	 	 	 	 	 dd
       Z
e	dd       ZddZddZddZddZ	 d	 ddZddZddZdddZy)TuningProcessz
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    NOptional[int]r<   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queuec                    t         j                  dt        j                  j	                  t
                     	 t        j                  | |       y# t        $ r t         j                  d       Y yw xY w)z4
        Entry point for the child process.
        z2Entering TuningProcess child. Visible devices = %szException in TuningProcessN)
autotuning_logdebugr8   r9   r:   r'   r@   workloop	Exception	exception)rC   rD   s     r1   process_mainzTuningProcess.process_maini   s\     	@JJNN/0	
	C""=.A 	C$$%AB	Cs   A A10A1c                   	 | j                         }|yt        |t              r|j                  t	                      nGt        |t
              r |j                  |j                                nt        dt        |             )z<
        Work loop for the benchmarking subprocess.
        NzInvalid request type )	r:   
isinstancer*   putr4   BenchmarkRequest	benchmarkRuntimeErrortype)rC   rD   objs      r1   rH   zTuningProcess.workloopz   sr    
 ##%C{C&""46*C!12""3==?3"%:49+#FGG r0   c                ^    | j                   duxr | j                  duxr | j                  duS )z?
        True if the sub-process has been initialized.
        NrB   rC   rD   selfs    r1   validzTuningProcess.valid   s;    
 LL$ 0""$.0##4/	
r0   c                .    dx| _         x| _        | _        y)z2
        Reset to an uninitialized state.
        NrU   rV   s    r1   clearzTuningProcess.clear   s     CGFFt)D,?r0   c                   | j                         ryt        j                  d      }|j                         | _        |j                         | _        |j                  | j                  | j                  | j
                  f      | _        | j                  J t        | j                        5  | j                  j                          ddd       y# 1 sw Y   yxY w)z
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        Nspawn)targetargs)rX   r   get_contextr   rC   rD   ProcessrK   rB   r>   r<   start)rW   ctxs     r1   
initializezTuningProcess.initialize   s     ::< ))'2 YY[!iik{{$$""## # 
 ||'''$T[[1 	!LL 	! 	! 	!s   ,CCc                v    | j                          | j                  J | j                  j                  |       y)z8
        Push a work item to the child process.
        N)rc   rC   rN   )rW   rS   s     r1   rN   zTuningProcess.put   s4    
 	!!---s#r0   c                   | j                   J | j                  J 	 	 |}d}|(|dk\  r#|dz  }	 | j                  j                  d      }	 || j                  j                  |      }|S # t        j                  $ r | j                   j                         s Y nw xY w|W|dk\  r^# t        j                  $ r> | j                   j                  }|| j                  ||        | j                           w xY w)a,  
        Get a response from the child process. Raises queue.Empty on timeout
        or if the process dies.

        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
        to populate the timeouts:

        Arguments:

            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                                      remains alive. Defaults to 1.0 or to
                                      config.max_autotune_subproc_terminate_timeout_seconds.
        Returns:
            A response from the child process (Any type)
        N      ?g      ?timeout)graceful_timeoutterminate_timeout)	rB   rD   r:   queueEmptyis_aliveexitcodekillrZ   )rW   result_timeoutri   rj   remaining_timeoutresstatuss          r1   r:   zTuningProcess.get   s)   , ||'''""...$2!'38IS8P%,%""1155c5B ;--11:K1LC
 !;; "#||446!  7" (38IS8P ;; 
..>II)9*;    JJL
s5   B+ A-  B+ -.BB+ BB+ $B+ +AC<c                    | j                         r8| j                  J | j                  J | j                  j                  d       yy)z8
        Signal the child process to terminate.
        N)rX   rB   rC   rN   rV   s    r1   	terminatezTuningProcess.terminate   sH     ::<<<+++%%111""4( r0   c                r    | j                   +| j                   j                          | j                          yy)z5
        Wait for the child process to exit.
        N)rB   joinrZ   rV   s    r1   waitzTuningProcess.wait   s,     <<#LLJJL $r0   c                H   | j                   | j                          | j                   j                  |       | j                   j                         rt        j                  d| j                   j                         | j                   j                          | j                   j                  |       | j                   j                         rDt        j                  d| j                   j                         | j                   j                          | j                          y y )Nrg   z&Sending SIGTERM to process with PID %dz&Sending SIGKILL to process with PID %d)
rB   ru   rw   rm   rF   warningpiderrorro   rZ   )rW   ri   rj   s      r1   ro   zTuningProcess.kill  s    
 <<#NNLL&67||$$&&&<LL$$ &&(!!*;!<<<((*"((@(( LL%%'JJL! $r0   )rC   
Queue[Any]rD   r}   returnNone)r~   boolr~   r   )rS   r   r~   r   )g      ^@g      @rf   )r~   r   )g      @rf   )r,   r-   r.   __doc__r<   __annotations__rB   rC   rD   staticmethodrK   rH   rX   rZ   rc   rN   r:   ru   rx   ro   r/   r0   r1   r@   r@   \   s     !FM %)G")*.M'.+/N(/C!C"C 
C C  H H 
G!2$ MP1	1f)r0   r@   c                  ^    e Zd ZU dZdZded<   dZded<   ddZddZdd	Z	dd
Z
	 	 	 	 ddZy)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorc                   | j                   du | j                  du k(  sJ | j                   y| j                         }t        j	                  d|       t        j                         | _         |D ]R  }t        |      }|j                          |j                  t                      | j                   j                  |       T | j                   j
                  D ]$  }t        |j                  d      t              r$J  t        t        |            | _        t         s"daddl}|j%                  | j&                         yy)z,
        Start the child processes.
        Nz$Sub-process autotune device list: %s)r<   )rp   )max_workersTr   )r   r   get_device_listlogrG   rk   r   r@   rc   rN   r*   rM   r:   r4   r   lenEXIT_HANDLER_REGISTEREDatexitregisterru   )rW   devicesr<   pr   s        r1   rc   zTuningProcessPool.initialize$  s    $&DMMT,ABBB>>%&&(		8'B  	"FV,ALLNEE$&MNNq!		" %% 	@Aaee4e8$???	@ +s7|D
 '&*#OODNN+	 'r0   c                l   t         j                  sdgS t               }t        |      }|j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r#   autotune_multi_devicer   r   device_countr'   r8   r9   splitintr   listrange)rW   gpu_typedevice_interfacecountdr   s         r1   r   z!TuningProcessPool.get_device_listI  s     ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   7B1c                2   | j                   !| j                   j                          d| _         | j                  ^| j                  j                  D ]  }|j	                           | j                  j                  D ]  }|j                           d| _        yy)z:
        Signal all child processes to terminate.
        N)r   shutdownr   rk   ru   rx   )rW   r   s     r1   ru   zTuningProcessPool.terminate]  s     ==$MM""$ DM>>%^^)) ^^)) !DN &r0   c                F   |j                   J | j                  J | j                  j                         }|j                  |j                          	 |j                  t        j
                  t        j                  t        j                        | j                  j                  |       S # t        j                  $ rB t        j                  d| d       t        d      cY | j                  j                  |       S w xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        zFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)bmreqr   r:   rN   r#   +max_autotune_subproc_result_timeout_seconds-max_autotune_subproc_graceful_timeout_seconds.max_autotune_subproc_terminate_timeout_secondsrk   rl   warningswarnfloat)rW   choicerB   s      r1   r]   zTuningProcessPool.targetl  s     ||'''~~)))..$$&FLL!	(;;BBDDEE NNw' {{ 	 MM.vh 7W W
 <NNw'	  NNw's$   <B+ +7D "D ?D  D D c                    | j                   J d       | j                  J i }t        || j                  j                  | j                  |            D ]
  \  }}|||<    |S )z>
        Benchmark each choice in a separate process.
        z&Tuning process pool is not initialized)r   r   zipmapr]   )rW   choicesresultsr   results        r1   rP   zTuningProcessPool.benchmark  sp     ~~)S+SS)}}((( "'4==+<+<T[['+RS 	%NFF$GFO	% r0   r   )r~   zSequence[Optional[int]])r   r    r~   r   r   zlist[TritonTemplateCaller]r~   z!dict[TritonTemplateCaller, float])r,   r-   r.   r   r   r   r   rc   r   ru   r]   rP   r/   r0   r1   r   r     sK     7;I3:-1H*1#,J"("(6+ 
+r0   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicer<   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
     :   t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J |j                         }|J t        ||t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                         |j+                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wN)rM   r   .0xs     r1   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>  s     AQz!Z0A   fake)r   layout)fallback)r<   r   r   r   r   r   )rM   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r&   graphsizevars
size_hintsget_sizer#   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r   noder   r<   s          r1   r   zTensorMeta.from_irnodes  sU    gx(>E F!1!1!!4 FF FA&AAAAMdBII&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)r<   r   
extra_size)r   r   r   r<   r   r   rV   s    r1   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r0   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r~   #Union[TensorMeta, list[TensorMeta]])r~   torch.Tensor)r,   r-   r.   r   r   classmethodr   r   r/   r0   r1   r   r     sQ    ((++KD-!
E!
	,!
 !
F
r0   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)rO   a1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                    || _         t        |t              r|g}|| _        t        t        t
        f      r)t              dkD  rt        fdD              sJ d   | _        || _	        y )Nr!   c              3  d   K   | ]'  }d D ]   }t        d   |      t        ||      k(   " ) yw))r<   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r1   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sG       Q  .q148GAt<LLLs   -0r   )
kernel_namerM   r   input_tensor_metatupler   r   r   r   
extra_args)rW   r   r   r   r   s      ` r1   __init__zBenchmarkRequest.__init__  s     ''4!2 3!2(5$-8%&* /   
 "4A!6"4$r0   c                   t         r   NotImplementedErrorrW   output_tensorinput_tensorss      r1   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r0   c                     y r   r/   rV   s    r1   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r0   Nr   c                   t         r   r   rW   fnr   r   s       r1   do_benchzBenchmarkRequest.do_bench  s
     "!r0   c                   t         j                  t        j                        }|rt	        j                         }|Ft        |      dk(  sJ t        d | j                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         } | j                  |g|| }|r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)Nr   c              3  <   K   | ]  }|j                           y wr   )r   r   s     r1   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rF   isEnabledForloggingDEBUGtimer   r   r   r   r   r   r6   infor   r   rG   r;   r   )
rW   r   r   rG   start_tscreate_tensor_elapser   load_elapseoutbench_elapses
             r1   rP   zBenchmarkRequest.benchmark  sQ   
 ++GMM:yy{H  }%***!!P9O9O!PPM 33==?M#'99;#9 yy{H	 !!!=NNB ))+0Kyy{HdmmB>>>99;1L  HD	$ 	
+ 1 	  RS<	 s   0E )E=<E=)
r   r;   r   r   r   r   r   Iterable[Any]r~   r   r   r   r   r   r~   zCallable[[], None]r   r   r   r   zOptional[torch.Tensor]r~   r   )	r,   r-   r.   r   r   r   r   r   rP   r/   r0   r1   rO   rO     s    %% ?% @	%
 "% 
%6"*";G"	"
 15	" %" .	"
 
" 15)$) .) 
	)r0   rO   c                  2    e Zd ZdZdddZdd	 	 	 	 	 ddZy)	TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    Nc                    || _         y r   )value)rW   r  s     r1   r   zTestBenchmarkRequest.__init__?  s	    
r0   r   c               H    | j                   t        d      | j                   S )NzFailed to run)r  rI   r   s      r1   rP   zTestBenchmarkRequest.benchmarkB  s#     ::O,,zzr0   r   )r  zOptional[float]r~   r   r  )r,   r-   r.   r   r   rP   r/   r0   r1   r  r  9  s0    
 UY*;Q	r0   r  c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkMixinNr   c                  t        d g ||D              }t        |      dk  s
J d|        t        d |D        d      }t        |      }t        |      dk(  rt        t	        |            }n|j                         }|j                  |      5  t        j                  |      }|j                          d d d        |S # 1 sw Y   S xY w)Nc              3     K   | ]i  }t        |t        j                        rMt        |j                  j
                        r.|j                  j                  |j                  j                   k y wr   )rM   torchTensorr   r<   rR   indexr   tensors     r1   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>Q  sR      $
&%,,/v}}))*##/	 MM$
s   A/A1r!   zCan not mix devices c              3     K   | ]9  }t        |j                  j                        r|j                  j                   ; y wr   )r   r<   rR   r  s     r1   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>Z  s4      &--,,- ""s   ?Acuda)
r   r   nextr   itercurrent_devicer<   r%   benchmark_gpusynchronize)	rW   r   r   r   device_idx_setdevice_typer   
device_idxr   s	            r1   r   z GPUDeviceBenchmarkMixin.do_benchK  s     $ $
9M9=9$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0 	+++B/C((*	+ 
		+ 
s   &CCr  r,   r-   r.   r   r/   r0   r1   r
  r
  J  s*    
 15	 % .	
 
r0   r
  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkMixinNr   c               ,    t        j                  |      S r   )r%   benchmark_cpur   s       r1   r   z CPUDeviceBenchmarkMixin.do_benchn  s     ((,,r0   r  r  r/   r0   r1   r  r  m  s*    
 15	- %- .	-
 
-r0   r  c                  ~     e Zd Z	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd ZddZ xZS )	TritonBenchmarkRequestc                    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        || _        || _	        y r   )
superr   module_pathmodule_cache_key
num_stages	num_warpsmatrix_instr_nonkdimwaves_per_eukpackworkspace_arg)rW   r   r   r   r   r%  r&  r'  r(  r)  r*  r+  r,  	__class__s                r1   r   zTritonBenchmarkRequest.__init__z  sW     	&79KZX& 0$"$8!(
*r0   c                 	
 t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  	t        | j                        d	j                  _        i dd l}d|j                  	      j                  v rdd<   j                   j"                  dk(  rd
nPj                   j"                  }t%        |      }|j'                  | j(                  j                   j*                        
| j,                  | j,                  	
fd}|S t/        t        || j                        t0        j2                  j4                  j6                  j8                        r!t;        j<                  	gi d
iS t;        j<                  	gi 
dd	S )
Nz"benchmark module key: %s, path: %sFr   warmupcpuc                    j                   } t        j                  | fdt        j                  j                        }j
                  t        j                  k7  r|j                           g |i dd y )N)r!   r   r<   Tstreambenchmark_run)	r   r  empty_strideduint8r<   	zero_moder$   UNINITIALIZEDzero_)	workspace_sizeworkspace_tensorr   r   r   
run_methodr4  
warmup_argr,  s	     r1   run_with_workspacez>TritonBenchmarkRequest.make_run_fn.<locals>.run_with_workspace  s    !.!4!4#(#6#6#%++(//	$  !**.?.M.MM$**,  "! %  	
 ! ""&r0   r4  Tr3  )r   load_by_key_pathr&  r%  rF   rG   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersr<   rR   r   get_raw_streamr   r  r,  rM   r  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rW   r   r   modrD  r  r   r?  r   r=  r4  r>  r,  s    ``     @@@@@r1   r   z"TritonBenchmarkRequest.make_run_fn  s    **4+@+@$BRBRS0!!	
 S$"2"2377
$//*
27
/ 
w((4???#(Jx $$-F'..33K7D%44''..44F ) ..M 2 &%C))*OO##55DD
 $$  	
    $$  	
  " r0   c                    t        j                  | j                  | j                        }t	        || j
                        j                          y r   )r   r@  r&  r%  r   r   
precompile)rW   rN  s     r1   rP  z!TritonBenchmarkRequest.precompile  s9    **4+@+@$BRBRST%%&113r0   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r%  r&  rV   s    r1   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr0   )r   r   r   N)r   r;   r   r   r   r   r   r  r%  r;   r&  r;   r'  r   r(  r   r)  r   r*  r   r+  r   r,  zOptional[WorkspaceArg]r~   r   r  r~   r;   )r,   r-   r.   r   r   rP  rS  __classcell__r-  s   @r1   r"  r"  w  s     %&04++ ?+ @	+
 "+ + + + + "+ + + .+ 
+2R*R;GR	Rh4Ur0   r"  c                      e Zd Zy)TritonGPUBenchmarkRequestNr+   r/   r0   r1   rX  rX    r2   r0   rX  c                      e Zd Zy)TritonCPUBenchmarkRequestNr+   r/   r0   r1   rZ  rZ    r2   r0   rZ  c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 d	dZd
dZd Zd
dZddZ	 xZ
S )CUDABenchmarkRequestc                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr   F so)r$  r   source_coder;  	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerW   r   r   r   r   r`  r-  s         r1   r   zCUDABenchmarkRequest.__init__  sr     	&79KZX&#$15)-',$ "*7*=*=d>N>NPT*U't'r0   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )NPrecompiling %sr_  Done precompiling %s)rF   rG   r   compiler`  rV   s    r1   rP  zCUDABenchmarkRequest.precompile  s<     	.5d..53T:r0   c          	     B   | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        t        j                  j                         j                        }t!        | j                  | j                        }t        d      }| j"                  dkD  rht        j$                  | j"                  dz   dz  t        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d || S c c}w )Nzqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         r2  )ensure_dll_loadedupdate_workspace_sizer   r	   data_ptrrF   rG   r   re  rd  rb  r   r  r  current_streamcuda_streamr   r;  zerosfloat64r<   ra  rL  rM  )rW   r   r   r  r^   
stream_ptrr=  workspace_ptrs           r1   r   z CUDABenchmarkRequest.make_run_fn  sy    	 ""$ }-?
 V__&'
 
 	MMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mm$++DN
 %T^^%<%<%>?M   

 __
 	

 
 
 	
3
s    Fc           
        | j                   ry | j                          t        | j                  D ch c]  }|j                   c}      }t        |dz         D cg c]  }t        d        }}t        t        j                  j                         j                        }t        | j                  | j                        }t               } |g || j                  t!        |      d |  t        j                  j#                          |j$                  | _        t(        j+                  d| j&                  | j                  | j,                  | j.                  | j                  || j                         d| _         y c c}w c c}w )Nr!   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)rc  ro  r   r   r   r   r	   r  r  rr  rs  r   rb  r   r   r   r   r  r  r;  rF   rG   re  rd  )rW   metaunique_input_count_r^   rv  r=  c_workspace_sizes           r1   rp  z*CUDABenchmarkRequest.update_workspace_size;  sW   ''  #'#9#9:4TYY:
 )..@1.D(EF1FFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$; ;Fs   E<Fc                    | j                   4t        j                  | j                  d      \  | _         | _        | _        y y )Nr_  )rb  r   loadr`  rd  re  rV   s    r1   ro  z&CUDABenchmarkRequest.ensure_dll_loaded_  s:    888E8J8J  $95DHdmT%5 r0   c                ^    | j                   | j                   j                          d | _        y r   )rb  closera  rV   s    r1   r   z#CUDABenchmarkRequest.cleanup_run_fne  s!    88HHNNr0   c                T    d| j                   d| j                  d| j                  S )NrR  z, self.source_file=z, self.hash_key=)r   re  rd  rV   s    r1   rS  zCUDABenchmarkRequest.__str__j  s0    #$""$$8t'7'7&99JDMM;KLLr0   r   r;   r   r   r   r   r   r  r`  r;   r~   r   r  r   rT  )r,   r-   r.   r   rP  r   rp  ro  r   rS  rU  rV  s   @r1   r\  r\    s    VV ?V @	V
 "V V 
V$;%
*%
;G%
	%
N",H
Mr0   r\  c                  b     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZd	dZ xZS )
CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r   )r$  r   r`  r   rd  rb  rg  s         r1   r   zCppBenchmarkRequest.__init__r  s5     	&79KZX& -6:r0   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )Nri  r0  r  rj  )rF   rG   r   r~  r`  rV   s    r1   rP  zCppBenchmarkRequest.precompile  s<     	.5$**>3T:r0   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )Nr0  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr   )rM   ctypesc_ulonglong)r   args     r1   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>  s     R3:c6#5#56Rs   $&)r   r~  r`  rb  r   rq  rF   rG   r   r   r   r   r  r  r   argtypesrL  rM  )rW   r   r   r  r^   r=  s         r1   r   zCppBenchmarkRequest.make_run_fn  s     $$T%5%55I04]0C}o0UVf!VVXHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ws   D)c                    | j                   3	 t        | j                   d      r| j                   j                          y y y )Nr  )rb  hasattrr  rV   s    r1   r   z"CppBenchmarkRequest.cleanup_run_fn  s9    88 txx)  *	  r0   c                     d| j                   S )NrR  )r   rV   s    r1   rS  zCppBenchmarkRequest.__str__  s    #$""$%%r0   r  r  r   rT  )	r,   r-   r.   r   rP  r   r   rS  rU  rV  s   @r1   r  r  n  so    ;; ?; @	;
 "; ; 
;;
*
;G
	
6!&r0   r  c                ,    t         j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )tuning_poolrP   )r   s    r1   benchmark_in_sub_processr    s       ))r0   )r<   rA   r   )_
__future__r   
contextlibr  dataclassesrL  r   r8   rk   r   r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r  torch._inductor.async_compiler   torch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   torch._loggingr   torch.utils._ordered_setr   multiprocessing.processr   multiprocessing.queuesr   typesr    torch._inductor.select_algorithmr    codegen.commonr"   r^  r#   r$   runtime.benchmarkingr%   virtualizedr&   r'   r   r,   rF   	getLoggerr   r*   r4   rI   r6   contextmanagerr>   	dataclassr@   r   r  r   r   LayoutOrBufferr   rO   r  r
  r  r"  rX  rZ  r\  r  r  r/   r0   r1   <module>r     s   "      	    . 1 2 2 @ @  $ ! C .   7 , / 3, E,  - -  .  "8\:g!	 		 			 	 7 7( y y yx ~ ~ ~B  ! ryy"))+, 3
 3
 3
l ] ] ]@+ "   F- -uU- uUp	 79O 		 79O 	tM24D tMn<&13C <&~*'*&*r0   