
    Vh+                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZmZ d dlZd dlmZmZ d dlmZ ej.                  j1                  ed	      Zexr ej6                  j9                         Zd
Z ed      Z ed      Zdeee
ef   ef   deee
ef   ef   fdZ  G d d      Z! G d de!      Z" G d de"      Z#er e#       Z$y e"       Z$y)    N)cached_propertywraps)chain)median)AnyCallable)Concatenate	ParamSpecSelfTypeVar)countersdynamo_timed)use_experimental_benchmarkerbenchmarkingi  PTfnreturnc           	           t               dt        dt        j                  dt        j                  dt
        f fd       }|S )zWraps `fn` with `dynamo_timed` context, and increments the appropriate dynamo
    counters. It is expected that `fn` is a method of `Benchmarker` or one of its
    subclasses; typing limitations prevent us from declaring this directly.
    selfargskwargsr   c                     | j                   j                   dj                   }t        d   d| xx   dz  cc<   t        |d      5   | g|i |cd d d        S # 1 sw Y   y xY w)N.inductorzbenchmarking.   T)log_pt2_compile_event)	__class____name__r   r   )r   r   r   fn_qual_namer   s       T/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/runtime/benchmarking.pywrapperztime_and_count.<locals>.wrapper"   so    ..112!BKK=A}\N;<A<,dC 	-d,T,V,	- 	- 	-s   A!!A*)r   r   r   r   r   r   )r   r"   s   ` r!   time_and_countr#      sF     2Y-c -!&& -AHH - - - N    c                       e Zd ZdeddfdZedededef   deedf   de	e
ef   d	edefd
       Ze	 ddedeg ef   dededef
d       Zededed	edefd       Zy)Benchmarkerr   r   Nc                      y N )r   s    r!   __init__zBenchmarker.__init__-   s    r$   r   .fn_args	fn_kwargsr   c                 v   d}t        j                               D ]F  }t        |t        j                        s||j
                  }-|j
                  |k7  s=t        d       |t        d      fd}|t        j
                  d      k(  r | j                  |fi |S  | j                  |fi |S )a  Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
        actual runtime calculation is dictated by the benchmarking implementation, but may be
        one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
        device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
        `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
        if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
        types are found.

        Arguments:
        - fn: The function to benchmark.
        - fn_args: The function's arguments.
        - fn_kwargs: The function's kwargs.

        Keyword Arguments:
        - **kwargs: The benchmarking implementation's kwargs.

        Returns:
        - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
        NzcCan't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!zCan't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly.c                        i S r(   r)   )r   r+   r,   s   r!   <lambda>z'Benchmarker.benchmark.<locals>.<lambda>Y   s    B595 r$   cpu)	r   values
isinstancetorchTensordevice
ValueErrorbenchmark_cpubenchmark_gpu)r   r   r+   r,   r   inferred_devicearg_or_kwarg	_callables    ```    r!   	benchmarkzBenchmarker.benchmark0   s    6 !'9+;+;+=> 	LlELL9&"."5"5$$7 y 	 " t  6	ell511%4%%i:6:: "t!!)6v66r$   r;   warmuprepc                 h    dt         dt        t           ffd} ||       t         ||            S )a  Benchmark the CPU callable, `_callable`, and return the median runtime,
        in milliseconds.

        Arguments:
        - _callable: The CPU callable to benchmark.

        Keyword Arguments:
        - warmup: Optionally, the duration, in milliseconds, to run `_callable`
        before benchmarking starts.
        - rep: Optionally, the duration, in milliseconds, to run `_callable`
        during benchmarking.

        Returns:
        - The median runtime of `_callable`, in milliseconds.
        msr   c                     g }t        j                         }	 t        j                         }         t        j                         }|j                  ||z
  t        z         ||z
  t        z  | kD  r	 |S ]r(   )timeperf_counterappendMILLISECONDS_PER_SECOND)r@   timingsrun_start_tstart_tend_tr;   s        r!   run_forz*Benchmarker.benchmark_cpu.<locals>.run_foru   sp    G++-K++-))+3JJK[(,CCrIN r$   )intlistfloatr   )r   r;   r=   r>   rJ   s    `   r!   r7   zBenchmarker.benchmark_cpua   s2    (
	 
	U 
	 	gcl##r$   r   c                     t         r(   )NotImplementedError)r   r   r   s      r!   r8   zBenchmarker.benchmark_gpu   s    !!r$   )   d   )r   
__module____qualname__r   r*   r#   r   r   tupledictstrrM   r<   rK   r7   r8   r)   r$   r!   r&   r&   ,   s    t   .7.7S#X.7 sCx.7 S>	.7
 .7 
.7 .7` OR $ $'C0 $:= $IL $	 $  $D "D " " " " "r$   r&   c            	       \    e Zd Zedededef   fd       Zededeg ef   dede	fd       Z
y)	TritonBenchmarkerr   r   .c                 N    	 ddl m} |S # t        $ r}t        d      |d}~ww xY w)z"Lazily import Triton's `do_bench`.r   )do_benchzrequires TritonN)triton.testingrZ   ImportErrorrO   )r   rZ   es      r!   triton_do_benchz!TritonBenchmarker.triton_do_bench   s4    	@/   	@%&78a?	@s   
 	$$r;   r   c                 0   t        j                  | j                        j                  }t	        |j                               D ]
  }||vs||=  d|v r | j                  |fi |d   S d|v r | j                  |fi |S  | j                  |fi |ddiS )a  Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds.

        Arguments:
        - _callable: The GPU callable to benchmark.

        Keyword Arguments:
        - quantiles: Optionally, a tuple of floats denoting the requested quantiles.
        - return_mode: Optionally, the requested return mode. Currently, Triton's
        `do_bench` supports min, max, mean, and median return modes.
        - **kwargs: Additional kwargs passed to Triton's `do_bench`.

        Returns:
        - The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified,
        this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified,
        this is the requested return mode. Otherwise, this is the median.
        	quantilesr   return_moder   )inspect	signaturer^   
parametersrL   keys)r   r;   r   do_bench_paramskwargs        r!   r8   zTritonBenchmarker.benchmark_gpu   s    $ "++D,@,@ALL&++-( 	"EO+5M	" & '4''	<V<Q??f$'4''	<V<<#t##INNXNNr$   N)r   rR   rS   r   r   r   r   r^   r#   rM   r8   r)   r$   r!   rX   rX      sk    d xS'9   OD OXb#g-> O# ORW O Or$   rX   c                   T   e Zd Zededefd       Zdededeee	j                  j                  e	j                  j                  f      fdZdedeee	j                  j                  e	j                  j                  f      defdZe	 	 	 	 ddedeg ef   d	ed
ededededefd       Zy)InductorBenchmarkerr   r   c                     t         j                  j                         }t         j                  j                  |      }|j                  S )z7Get the L2 cache size, in bytes, of the current device.)r3   cudacurrent_deviceget_device_propertiesL2_cache_size)r   r5   propss      r!   rn   z!InductorBenchmarker.L2_cache_size   s6     **,

008"""r$   itersc                     t        |      D cg c]B  }t        j                  j                  d      t        j                  j                  d      fD c}S c c}w )z!Get `iters` pairs of CUDA events.T)enable_timing)ranger3   rk   Event)r   rp   _s      r!   get_event_pairsz#InductorBenchmarker.get_event_pairs   sV     5\

  

  t 4

  t 4
 	
 
s   AAevent_pairsc           	      h    t        |D cg c]  \  }}|j                  |       c}}      S c c}}w )zIGet the minimum timing, in milliseconds, for a group of CUDA event pairs.)minelapsed_time)r   rw   start_event	end_events       r!   get_event_pairs_min_timingz.InductorBenchmarker.get_event_pairs_min_timing   s=      /:*K ((3
 	
s   .
r;   estimation_itersmemory_warmup_itersbenchmark_itersmax_benchmark_durationr   c           	         t         j                  j                           |        t         j                  j                          t        j                  | j                  dz  t         j
                  d      }|j                          | j                  |      }|D ]<  \  }	}
|j                          |	j                           |        |
j                          > t         j                  j                          | j                  |      }t        t        |t        ||z              d      }t        |      D ]  }|j                           | j                  |      }|D ]<  \  }	}
|j                          |	j                           |        |
j                          > t         j                  j                          | j                  |      }~t        ||      S )a<  Benchmark a GPU callable using a custom benchmarking implementation.

        Arguments:
        - _callable: The callable to benchmark.

        Keyword Arguments:
        - estimation_iters: Optionally, the number of iterations to run `_callable`
        during runtime estimation.
        - memory_warmup_iters: Optionally, the number of iterations to flush the L2
        cache before starting benchmarking.
        - benchmark_iters: Optionally, the number of iterations to run `_callable`
        during the benchmarking.
        - max_benchmark_duration: Optionally, the maximum duration of the benchmarking,
        in milliseconds. An estimated duration is calculated based on the values
        of `memory_warmup_iters` and `benchmark_iters`, along with the estimated
        runtime of `_callable` and various other factors, and we then shrink
        `benchmark_iters` to fit in the alloted maximum duration.
        - **kwargs: Additional kwargs that may be passed to the fallback.

        Returns:
        - The minimum runtime of `_callable`, in milliseconds.
           rk   )dtyper5   r   )r3   rk   synchronizeemptyrn   rK   zero_rv   recordr}   maxry   rs   )r   r;   r~   r   r   r   r   bufferrw   r{   r|   estimated_timingru   benchmarked_timings                 r!   r8   z!InductorBenchmarker.benchmark_gpu   s   B 	

  	

  T//14EIIfU **+;<&1 	"KLLN K		
 	

 ::;G %;?O%O!PQST

 *+ 	ALLN	 **?;&1 	"KLLN K		
 	

 !<<[I  #%788r$   N)   rQ   rQ      )r   rR   rS   r   r   rK   rn   rL   rT   r3   rk   rt   rv   rM   r}   r#   r   r   r8   r)   r$   r!   ri   ri      s#   #D #S # #





	eEJJ$$ejj&6&667	8

	
	
!%eEJJ,<,<ejj>N>N,N&O!P	
		
  !"#&"&(M9M9BG$M9 M9 !	M9
 M9 !$M9 M9 
M9 M9r$   ri   )%rb   rB   	functoolsr   r   	itertoolsr   
statisticsr   typingr   r   typing_extensionsr	   r
   r   r   r3   torch._dynamo.utilsr   r   torch._inductor.configr   _logginggetArtifactLoggerr   loggerrk   is_availablerE   r   r   r#   r&   rX   ri   benchmarkerr)   r$   r!   <module>r      s      ,     C C  6 ? 
	)	)(N	C >UZZ%<%<%> 
  cNCLS!V$a'(k#q&!1$%$Z" Z"z$O $ONm9+ m9b : ?P?R r$   