
    kVhPN                         d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ d Zd Zd	 Zdd
ZddZddZ G d d      Z G d d      Zd ZddZddZd Zedd       ZddZy)    N)contextmanager)AnyDictList   )language)runtimec                    dj                  |       } dddd| z   dg}t        j                  |      }|j                  t        j
                  j                        j                  d      }|D cg c]  }t        |       }}|S c c}w )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounits)	join
subprocesscheck_outputdecodesysstdoutencodingsplitint)attrscmdoutretxs        >/home/dcms/DCMS/lib/python3.12/site-packages/triton/testing.pynvsmir      sy    HHUOEsNU$:<[
\C

!
!#
&C
**SZZ((
)
/
/
4C
a3q6
C
J  s   -Bc                 t     t               t                 fd}|D cg c]
  } ||       c}S c c}w )Nc                     d| cxk  rdk  st        d       t        d      | dz
  z  }t        j                  |      }t        j                  |      }||z
  }d|z
  |   z  ||   z  z   S )Nr   r   z%Quantiles must be in the range [0, 1])
ValueErrormathfloorceil)qpointloweruppertans        r   get_quantilez_quantile.<locals>.get_quantile   s}    Q!DEE DEEQU

5!		% EMA5!A%L00    )lensorted)r*   r%   r,   r+   s   `  @r   	_quantiler0      s4    AAq	A1 &''LO'''s   5c                     |!t        | |      }t        |      dk(  r|d   }|S |dk(  r| S |dk(  rt        |       S |dk(  rt        |       S |dk(  rt	        j
                  |       S |dk(  rt	        j                  |       S y )Nr   r   allminmaxmeanmedian)r0   r.   r3   r4   
statisticsr5   r6   )times	quantilesreturn_moder   s       r   _summarize_statisticsr;   *   s    y)s8q=a&C
e		5z		5z		u%%		   '' 
!r-   c                    ddl }|dv sJ |j                  j                  |j                  j                               5   |         |/|D ]*  }|j	                          |j                  d       d|_        , |j                  j                  d      }|j                  j                  d      }|j                          t        d      D ]	  }	 |          |j                          |j                  j                          |j                  |      dz  }
t        dt        ||
z              }|j                  j                         }|j                  j                  |      5  t        |      D ]  }	||D ]	  }d|_          |          	 ddd       |j                  j                          g }d}t        |      D ]  }	|j                  j                  d      }|j                  j                  d      }|j                          |j!                          |j                          |j                  j                          ||j                  |      |z  gz  } t#        |||      cddd       S # 1 sw Y   xY w# 1 sw Y   yxY w)	a  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    r   Nr3   r4   r5   r6   r2   Tenable_timing   r   
   )torchcudastreamStreamdetach_requires_grad_gradEventrecordrangesynchronizeelapsed_timer4   r   	CUDAGraphgraphreplayr;   )fnrepgrad_to_noner9   r:   rB   r   start_event	end_event_estimate_msn_repeatgr   	n_retriess                  r   do_bench_cudagraphr[   <   sS    AAAA			5::,,.	/ ,B
#! 		  & jj&&T&:JJ$$4$8	q 	AD	

 !..y9A=q#cK/01 JJ  "ZZa  	8_ +) &!%&		 	

 	y! 	DA*****>K

((t(<I HHJJJ""$K,,Y7(BCCC	D %S)[AY,B ,B4	 	5,B ,Bs%   D#I<!(I0
CI<0I9	5I<<Jc                    |dv sJ t         j                  j                  j                         } |         |j	                          t         j                  j                  j                         }|j                  d      }|j                  d      }	|j                          t        d      D ]2  }
t         j                  j                  j                  |        |         4 |	j                          |j	                          |j                  |	      dz  }t        dt        ||z              }t        dt        ||z              }t        |      D cg c]  }|j                  d       }}t        |      D cg c]  }|j                  d       }	}t        |      D ]	  }
 |          t        |      D ]h  }||D ]	  }d|_         t         j                  j                  j                  |       ||   j                           |         |	|   j                          j |j	                          t        ||	      D cg c]  \  }}|j                  |       }}}t        |||      S c c}w c c}w c c}}w )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float], optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    r=   Tr>   r@   r   N)r	   driveractiveget_device_interfacerL   get_empty_cache_for_benchmarkrI   rJ   rK   clear_cacherM   r4   r   rH   zipr;   )rQ   warmuprR   rS   r9   r:   dicacherT   rU   rV   rW   n_warmuprX   ir   ser8   s                      r   do_benchrj   {   s*   $ AAAA				3	3	5BDNNNN!!??AE (((.Kt,I1X ))%0
 NN**959K 1c&;./0H1c#+,-H9>xIA288$8/IKI7<XG!-GIG8_ 
 8_  #!  	))%0A
! NN+.{I+FG41aQ^^AGEG 	;??- JG( Hs   I-;I2I7c                    ddl }ddl}t        | |j                        s|j	                  |       } t        ||j                        s|j	                  |      }|d}t        |      r || j                        n|}|d}t        |      r || j                        n|}t        | |j                        rU| j                  |j                  k(  r| j                         } | j                         j                         j                         } t        ||j                        rU|j                  |j                  k(  r|j                         }|j                         j                         j                         }| j                  dkD  s|j                  dkD  r!|j                  j                  | |||d       y|j                  | |||      st        | d	|  d
| d| d| d
      y)a  
    Asserts that two inputs are close within a certain tolerance.

    :param x: The first input.
    :type x: scala, list, numpy.ndarray, or torch.Tensor
    :param y: The second input.
    :type y: scala, list, numpy.ndarray, or torch.Tensor
    :param atol: The absolute tolerance. Default value is 1e-2.
    :type atol: float, optional
    :param rtol: The relative tolerance. Default value is 0.
    :type rtol: float, optional
    :param err_msg: The error message to use if the assertion fails.
    :type err_msg: str
    r   Ng{Gz?g        r   T)atolrtol	equal_nan)rl   rm    z is not close to z (atol=z, rtol=))numpyrB   
isinstanceTensortensorcallabledtypebfloat16floatcpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrl   rm   err_msgnprB   s          r   assert_closer      s     a&LLOa&LLO|$TN4=D|$TN4=D !U\\"77enn$	AEEGNN""$!U\\"77enn$	AEEGNN""$ 	vvzQVVaZ


""1ad"N;;q!$T;2y!,=aSvWUYTZZ[\]] 3r-   c                   r    e Zd ZdZ	 	 	 	 	 ddee   dee   dedee   dee   ded	eeef   d
edededefdZ	y)	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    Nx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     || _         || _        |
| _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        y)aq  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        :param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
        :type styles: list[tuple[str, str]]
        N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   s                r   __init__zBenchmark.__init__   sY    ^ 
 "$
"	r-   ) r   FFN)
__name__
__module____qualname____doc__r   strr   r   boolr    r-   r   r   r      s     ;c; S	; 	;
 9; I; ; 38n; ; ; ; ;r-   r   c            	       8    e Zd Zd Z	 	 d	dedededefdZd
dZy)Markc                      || _         || _        y NrQ   
benchmarks)r   rQ   r   s      r   r   zMark.__init__5  s    $r-   bench	save_path
show_plots
print_datac                 	   dd l }dd lm}	 dd l}
|j                  }|j                  D cg c]  }| d	 }}|j                  D cg c]  }| d	 }}t        |j                        }|
j                  ||z   |z   |z         }|j                  D ]  }t        |t
        t        f      s|D cg c]  }| }}t        |      t        |      k7  rt        dt        |       d|       t        t        ||            }g g g }}}|j                  D ]I  } | j                   di ||j"                  |i|j$                  |}	 |\  }}}||gz  }||gz  }||gz  }K t        |      |z   |z   |z   |j(                  t        |      <    |j*                  r|	j-                          |	j/                         }|d   }t1        |j                        D ]  \  }}||dz      ||dz      }}|j2                  r|j2                  |   d   nd }|j2                  r|j2                  |   d   nd }|j5                  ||   ||   |||       |j7                         j9                         r|j7                         j9                         r|j;                  t<              }|j;                  t<              }|j?                  ||   ||d	|
        |jA                          |jC                  |jD                  xs |       |jG                  |jH                         |jK                  |jL                  rdnd       |jO                  |jP                  rdnd       |r|	jS                          |r8|	jU                  |jV                  jY                  ||j*                   d             |||j                  z      }|r=|jZ                  d   dk(  r+|j\                  j_                         \  }}||   ||   z
  |d<   |r1ta        |j*                  dz          ta        |jc                                |r?|je                  |jV                  jY                  ||j*                   d      d| dd       |S c c}w c c}w c c}w # t&        $ r
 |d d }}}Y 8w xY w)Nr   z-minz-max)columnsz	Expected z values, got r   )labelcolorlsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   rr   tupler.   r!   dictrb   r   rQ   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullr2   astyperx   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   tolistprint	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meanr   y_miny_maxr   dfrV   x_argsrow_meanrow_minrow_maxr   r   axfirst_xrg   colstycol0col1s                                  r   _runz	Mark._run9  s!   '!!%*%5%56A3d66%*%5%56A3d66u}}%\\'F"2U":U"B\C 	EAa$/ '(1Q((1vW% 9S\N-s!KLL#gq/*F)+RwgH__ #dggVV5>>1*=VVvV;+.(FE5 VH$E7"E7"# #1g07:WDBFF3r7O'	E* ??JJLBajG!%"2"23 V1!!f*~r!f*~u,1LLell1oa(d,1LLell1oa(d7RU!33G||~))+ELLN4F4F4H!LL/E!LL/EOOBwKTQTOUV IIKMM%,,1'2MM%,,'MM5;;%H=MM5;;%H=
BGGLLu6Gt4LMN%***+q(**,JD$DBtH,BvJ%//C'(",,.!IIbggll90A.FGXZ[iZjjkVl!  #	y 76 ) ! ;+.d5EF;s#   QQ#,	Q(Q--R ?R c           	         t        | j                  t              }|r| j                  gn| j                  }g }|rRt        j                  |d       t        t        j                  j                  |d      d      }	|	j                  d       |D ]I  }
|j                   | j                  |
|||fi |       |s+	j                  d|
j                   d       K |r!	j                  d       |	j                          |r	|r|d	   S |S y )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rr   r   r   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runzMark.run~  s    %dooyA*:doo&

KK	D1Y?EDJJ'( 	HEidiiy*j[TZ[\

]5??*;:FG	H JJ)*JJL!!}$!!r-   N)F   )FFr   F)	r   r   r   r   r   r   r   r   r   r   r-   r   r   r   3  s>    % chC) C C CSW CJr-   r   c                       fd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                     t        |       S r   )r   r   s    r   <lambda>zperf_report.<locals>.<lambda>  s    b*- r-   r   )r   wrappers   ` r   perf_reportr     s     .GNr-   c                    ddl }ddlm} | s|j                  j	                         } |j
                  j                  j                  |       d   }|j
                  j                  j                  |       d   }||z  dz  dz  d	z  }|S )
z return DRAM bandwidth in GB/s r   Nr   r]   mem_clock_ratemem_bus_widthr   g    .A   )rB   r	   r]   rC   current_devicer^   utilsget_device_properties)devicerB   r]   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr    sz    **,MM''==fEFVWM##99&A/RIi'!+c1A5GNr-   c                 J   dd l }ddlm} |s|j                  j	                         }|j
                  j                  j                  |      d   dz  }|j                  j                  |      }|d   dk  r| |j                  k(  sJ d}n| |j                  |j                  fv rd}nr| |j                  |j                  |j                  fv rd}nJ| |j                  t        j                   t        j"                  t        j$                  fv rd	}nt'        d
      ||z  |z  dz  }|S )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)rB   r	   r]   rC   r   r^   r  r  get_device_capabilityfloat16float32int32rw   int16int8tl
float8e4nvfloat8e4b15float8e5RuntimeError	rv   
clock_rater  rB   r]   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr    s   **,==&&<<VDE[\_``L11&9J!}q%%%U]]EKK00"u}}ennekkBB"uzz2==".."++NN#455J&)99D@FMr-   c                        fd}|S )Nc                 F     t        j                          fd       }|S )Nc                  r   dd l }|j                  t        j                               j	                         }
j                         |j                         k  }|r|dk7  rt        j                  j                  j                  d         }t        j                  d   dd}d|v sJ d       |d   j                  j                  j                  }| d	j                   d
| d}t        j                  ddd|gd|      }	|	j                   dk(  sJ d       dt#        |	j$                        v sJ y  | i | y )Nr   zcuda-memcheck__file__PATH1)r$  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r-  	ppid_namerun_cuda_memcheckr   r,  test_idr   r   target_kwargstest_fns             r   r   z1cuda_memcheck.<locals>.decorator.<locals>.wrapper  s!   rzz|499;I - 3 3 5 G Y/%Aww''(;(;J(GH!zz&1UXY F*n,nn* +0099<<b!1!1 2!G9A> nnox%L]agjk~~*e,ee*0C

OCCC((r-   )	functoolswraps)r=  r   r<  s   ` r   	decoratorz cuda_memcheck.<locals>.decorator  s%    		!	) 
"	)" r-   r   )r<  r@  s   ` r   cuda_memcheckrA    s    , r-   c           	   #     K   	 t        j                  g d       t        j                  dddd|  d|  g       t        j                  dddd| d| g       t        dg      d	   }t        d
g      d	   }t        || z
        dk  sJ d|  d       t        ||z
        dk  sJ d| d       d| z  }d|z  dz  }||f t        j                  g d       t        j                  g d       t        j                  g d       y # t        j                  g d       t        j                  g d       t        j                  g d       w xY ww)N)r   r   r   -pmr%  r   r   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryrA   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rC  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr  gbpss         r   set_gpu_clockrJ    sm    C EF a~>	!
 	 	#M?!M?C	!
 	 123A6678;<,./"4_8L\NZ^6__4==01B6b:N}o]a8bb6)L8&-dl EF AB AB 	 EF AB ABs   EB>D AEAEEc                    dd l }ddlm} |s|j                  j	                         }|j
                  j                  j                  |      d   dz  }|j                  j                         }|d   dk  r/| |j                  k(  rd}nW| |j                  k(  rd}nEt        d	      | |j                  k(  rd}n(| |j                  |j                  fv rd}nt        d	      ||z  |z  d
z  }|S )Nr   r   r   r	  r
  r       @   r  r  )rB   r	   r]   rC   r   r^   r  r  r  r  r  r  rw   r  s	            r   get_max_simd_tflopsrN    s    **,==&&<<VDE[\_``L113J!}qEMM!!emm#!455EMM!!u}}enn55!455J&)99D@FMr-   )   NNr5   )   d   NNr5   )NNr   r   )iF  i  )r>  r"   r   r7   r   r   
contextlibr   typingr   r   r   r   r   r  r	   r   r0   r;   r[   rj   r   r   r   r   r  r  rA  rJ  rN  r   r-   r   <module>rT     s      	   
 % " "  ( ($<B~?@D0^f@ @F` `F
:6 C C8r-   