
    Vh[9                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZ d dlmZ ddlmZ dd	lmZmZ  G d
 de
      Zg dZdedefdZdedefdZdefdZdede	eeef      ddfdZe j:                   G d d             Zdedej>                  j@                  jB                  de"de#deddfdZ$de"de#de#ded eddfd!Z%ded eddfd"Z&d eddfd#Z'ded eddfd$Z(y)%    N)defaultdict)
ModuleType)AnyOptionalProtocol)
DeviceType)
OrderedSet   )benchmarker)create_bandwidth_info_strget_num_bytesc                        e Zd ZdededefdZy)BenchmarkCallableTypetimesrepeatreturnc                      y N )selfr   r   s      Q/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/wrapper_benchmark.py__call__zBenchmarkCallableType.__call__   s        N)__name__
__module____qualname__intfloatr   r   r   r   r   r      s    =c=3=5=r   r   )foreachpersistent_reduction	pointwise	reduction
split_scantemplatesrc_coder   c                 j    t         D cg c]  }d| | v s| }}t        |      dk(  r|d   S yc c}w )z
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    z@triton_heuristics.r
   r   unknown)_kernel_category_choiceslen)r%   chchoicess      r   "get_kernel_category_by_source_coder,      sM     .3Frd1Kx1WG  7|qqzs   00
kernel_modc                 x    t         D cg c]  }|| j                  v s| }}t        |      dk(  r|d   S yc c}w )a  
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    r
   r   r'   )r(   __dict__r)   )r-   r*   r+   s      r   get_kernel_categoryr0   ,   sC     5Rbj>Q>Q8QrRGR
7|qqz	 Ss   77modc                     ddl m} | j                  j                         D cg c]$  \  }}|j	                  d      rt        ||      r|& }}}t        |      dk(  sJ |d   S c c}}w )Nr   )CachingAutotunertriton_r
   ))torch._inductor.runtime.triton_heuristicsr3   r/   items
startswith
isinstancer)   )r1   r3   kv	cand_lists        r   get_triton_kernelr<   >   sn    J LL&&(Aq<<	"z!5E'F 	
I 
 y>QQ<s   )A%benchmark_namebenchmark_all_configsc                    ddl m} d}|j                  D ]$  j                  }t	        d      rt	        d      s)t              }t              }j                         t        |j                  j                  D cg c]  }|j                  d      r| c}      }|j                  j                  dd      t        d|id	z  	 dd
t        dt         t"           dt         t"           dt         t"           dt$        dt$        ffd}	| dd|dd j'                          d|dd  }
|rt	        d      sJ j)                        }t+        |
       |j-                         D ]G  \  }}t+        d |	||j.                  |j0                  |j2                         d|j4                          I nt7        j8                  fdd      }t        |j:                        dk(  sJ d       |j:                  d   }t+         |	||j.                  |j0                  |j2                  |
 d             |dz  }' |dk(  rt+        d       yyc c}w ) aX  
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    r   )PyCodeCacheget_argscall
in_out_ptrkernel_num_gbNnum_in_out_argsg    eAmsn_regsn_spillssharedprefixr   c                     t        d |||fD              sd|dd|dd|dd}nd}| d	z  z  }t        | |||
      S )Nc              3   $   K   | ]  }|d u  
 y wr   r   ).0xs     r   	<genexpr>z>benchmark_all_kernels.<locals>.get_info_str.<locals>.<genexpr>s   s     EQqDyEs     3z regs  z	 spills  8z shared mem g     @@)rJ   suffix)anyr   )rF   rG   rH   rI   rJ   kernel_detail_strgb_per_snum_gbs          r   get_info_strz+benchmark_all_kernels.<locals>.get_info_strl   sl     E68V*DEE
'(1YvajT " %'!c*H,FHV<M r   20    
   r>   rP   z @ c                  &    j                         S r   )rB   )argsr-   s   r   <lambda>z'benchmark_all_kernels.<locals>.<lambda>   s    :??43H r   (   )repr
   z.Autotuner should have selected the best config)rJ   zpNo kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True)rS   )torch._inductor.codecacher@   moduleskeyhasattrr<   r0   rA   r)   fn	arg_namesr7   inductor_metagetr   r   r   r   strupperr>   printr6   rG   rH   rI   configr   benchmark_gpu	launchers)r=   r>   r@   nfound
kernel_keytriton_kernelkernel_categoryarg_namenum_in_out_ptrsrY   kernel_descbench_resultlauncherrF   r_   r-   rX   s                 @@@r   benchmark_all_kernelsrz   J   s    6F!)) A
^^
z:.gj&6Q)*5-j9""$ !. 0 0 : :&&|4 
 ,,00$G>"DJ/JSPF 		SM	 sm	 SM		
 	 	( b!?2A#6#<#<#>"?qCR@QR 	 !:'>???%;;DAL+ , 2 2 4 "b(//8;L;Lhoo^__bckcrcrbst
 **+HbQB}../14 @4 %..q1HOO%%OO)]!, 	!CAD {~	
 ss   I
c                   6    e Zd ZU eed<   eed<   eed<   eed<   y)ProfileEventcategoryre   self_device_time_mscountN)r   r   r   rk   __annotations__r   r   r   r   r|   r|      s    M	H Lr   r|   
event_listwall_time_msnrunsdevice_namec                    	
 dt         j                  j                  j                  dt        ffd
t        t              	dt         j                  j                  j                  dt        dd f	
fd}|D ]  }|j                  rJ d       |j                  t        j                  k(  r4d}|j                  j                  d      r\|j                  j                  d	      rd
}n>|j                  j                  d      rd}n |j                  j                  d      rd}nd} |||        dt        dt        t           dt        ffdd	 fd} |        y )Nevr   c                 (    | j                   dz  z  S )zV
        ev.self_device_time_total is in microsecond. Convert to millisecond.
          )self_device_time_total)r   r   s    r   get_self_device_timez6parse_profile_event_list.<locals>.get_self_device_time   s     ((4/%77r   r}   c                     t        || j                   |       | j                  z        }|   j                  |       y )N)r}   re   r~   r   )r|   re   r   append)r   r}   
profile_ev
all_eventsr   r   s      r   	add_eventz+parse_profile_event_list.<locals>.add_event   sB     " 4R 8((U"	

 	8##J/r   z!Don't support the legacy profilerr'   r4   
triton_poitriton_pointwise
triton_redtriton_reduction
triton_pertriton_persistent_reductiontriton_unknownprofile_eventsc           	         syddl m } |j                  d d       g }d}t        d|  d       |D ]]  }||j                  z  }|j                  z  d	z  d
d}|j	                  |j
                  d d |j                  |j                  |g       _ |j	                  d|d|z  d	z  d
dg       t         ||ddj                          dddg             |S )N        r   )tabulatec                     | j                   S r   )r~   )r   s    r   r`   zCparse_profile_event_list.<locals>.report_category.<locals>.<lambda>   s    2+A+A r   T)re   reversez
  == z category kernels == d   .2f%x   TotalrS   KernelzSelf z
 TIME (ms)CountPercent)headers)r   sortrm   r~   r   re   r   rl   )	r}   r   r   rows
total_timer   percentr   r   s	          r   report_categoryz1parse_profile_event_list.<locals>.report_category   s   % A4P
z!678  	SB"000J//,>DSIKGKKr'='=rxxQR	S 	j"l)BS)H(MQ&OP	
 	K--/0
;	
	
 r   c                     g d} t        j                               j                  t        |             sJ t        j                                       i }d}| D ]  }|v s ||         }|||<   ||z  } |z  dz  dd}
r t	        d
j                          d|        nt	        d       t	        d	d
d       d	 }| D ]&  }|j                  |d      z  dz  dd}|d| z  }( |d| dd
dz  }t	        |       y )N)r   r   r   r   r'   r   r   r   r   z
Percent of time when z
 is busy: zNo device detectedzTotal wall time .3fz mszOutput for tabulate: z, rF   )r	   keysissubsetlistrm   rl   rj   )category_listper_category_wall_timetotal_device_msr}   _timedevice_busy_percenttabulate_liner   r   r=   r   r   r   s           r   reportz(parse_profile_event_list.<locals>.report   sz   
 *//+,55j6OP 	
JOO%&'	
P "$% 	)H:%'*X2FG38&x05(		) "1<!?#!Ec J!L)+*;*;*=)>jI\H]^ &' c 2#67 0/?@% 	,H)--h<|KcQRUVVWX  r'^+M		,
 	212"\#4FbIImr   )r   N)torchautogradprofiler_util	EventListr   r   r   rk   	is_legacydevice_typer   CPUre   r7   r|   )r=   r   r   r   r   r   r   r}   r   r   r   r   s   ` ```    @@@r   parse_profile_event_listr      s6   8NN((228	8 1<D0AJ
0NN((22
0
0 

0   <<D!DD>>Z^^+66Y'vv  .-""<0-""<08+"h# &# tL7I e >* *X Hr   r   r   benchmark_compiled_module_fnc                    t         j                  j                  d      5 } |||       d d d        t        j                          d}j                  |       t        d| d       t        d|        |j                  d      }t        |j                  d	d
             t        ||| ||z  |j                         y # 1 sw Y   xY w)NT)record_shapesr   r   z/compiled_module_profile.jsonz4Profiling result for a compiled module of benchmark :z+Chrome trace for the profile is written to )group_by_input_shaper   r]   )sort_by	row_limit)r   profilerprofiletempfile
gettempdirexport_chrome_tracerm   key_averagestabler   
use_device)r   r   r   r=   r   ppathr   s           r   perf_profiler   )  s     
		d		3 Aq$5@A !!#$$ABD$	@@PPQ
RS	7v
>?T:J	*

#;r

JK
L%&.!,,A As   CCc                    dd l }dd l}dd l}|j                  |      }|j                  j                  |      }|j                  j                  |j                  j                  |            d   }t        j                         }t        j                  j                         j                  d      }	|j                  j                  |d|	 d      }
d| d| d}dd	d
dddddddddddd|
dd|g}	 |j                  |d       t        d|  d       t        d|
        y # |j                   $ r}t        d|        Y d }~y d }~ww xY w)Nr   z%Y%m%d_%H%M%Sncu_output_z.ncu-repz import sys; sys.path.insert(0, 'z	'); from zO import benchmark_compiled_module; benchmark_compiled_module(times=1, repeat=1)ncuz--target-processesallz--replay-modekernelz--kernel-name-basefunctionz--print-unitsbasez--setfullz--import-sourceyesz--force-overwritez--exportpython-cT)checkz%
NCU profiling results for benchmark r   zNCU report has been written to z!NCU profiling failed with error: )inspectos
subprocessgetfiler   dirnamesplitextbasenamer   r   datetimenowstrftimejoinrunrm   CalledProcessError)r=   r   r   r   r   module_file
module_dirmodule_namencu_dir	timestamp
ncu_output
python_cmdncu_cmdes                 r   ncu_analyzerr   >  sc    //">?K-J''""277#3#3K#@A!DK!!#G!!%%'00AIgYKx'HIJ,ZL 9 ;	;  	'G,wd+6~6FaHI/
|<=(( 1!56s   00D! !E0EEc                    t         j                  j                         sJ t         j                  j                  j	                  d        | dd       t        j                          d}t         j                  j                  j                  |       t         j                  j                  j	                  d        t        d|        y )	Ni )max_entriesr]   r
   r   z/memory_snapshot.pickle)enabledz0The collect memory snapshot has been written to )	r   cudais_availablememory_record_memory_historyr   r   _dump_snapshotrm   )r   snapshot_paths     r   collect_memory_snapshotr   r  s     ::""$$$	JJ,,,@ r!4**,--DEM	JJ$$]3	JJ,,T,:	<]O
LMr   c                 j   ddl }|j                         }|j                  dddd       |j                  dd	dd
       |j                  dddd       |j                  ddd       |j                  ddd       |j                         }|j                  rt        | |j                         yd}d}t        j                  j                         rt        j                  j                           |||      dz  }t        j                  j                         r1t        j                  j                         }t        d|dz  dd       t        j                  j                         r|j                  rt        |       |j                  rt!        |||| |       |j"                  rt%        | |       yy)zM
    This is the function called in __main__ block of a compiled module.
    r   Nz--benchmark-kernelsz-k
store_truez,Whether to benchmark each individual kernels)actionhelpz--benchmark-all-configsr   z8Whether to benchmark each individual config for a kernelz	--profilez-pz&Whether to profile the compiled modulez--cuda-memory-snapshotz
            Whether to collect CUDA memory snapshot. Refer to
            "https://pytorch.org/blog/understanding-gpu-memory-1/
            for details about how to visualize the collected snapshot
        z--ncuzWhether to run ncu analysisr]   r   r   zPeak GPU memory usage g    .Ar   z MB)argparseArgumentParseradd_argument
parse_argsbenchmark_kernelsrz   r>   r   r   r   reset_peak_memory_statsmax_memory_allocatedrm   cuda_memory_snapshotr   r   r   r   r   )	r=   r   r  parserr_   r   r   r   peak_mems	            r   compiled_module_mainr    s    $$&F
;	   !G	   5	       *  
 Dnd.H.HI::""$JJ..03%ORVV::""$zz668H*8c>#*>cBC::""$)B)B#$@A<<, 88)EF r   ))dataclassesr   r   collectionsr   typesr   typingr   r   r   r   torch.autogradr   torch.utils._ordered_setr	   runtime.benchmarkingr   runtime.runtime_utilsr   r   r   r(   rk   r,   r0   r<   dictrz   	dataclassr|   r   r   r   r   r   r   r   r   r   r  r   r   r   <module>r     s      #  * *  % / - K>H >   J 3 $	: 	T
T
08c3h0HT
	T
n   {{,,66{ { 	{
 { 
{|  	
 #8 
*117L1	1h
N"7
N	
NEGEG7LEG	EGr   