
    Vh                   Z	   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dl m!Z! d dlm"Z" d dl#m$Z$ d d	l%m%Z%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4Z4d dl5m6Z7 d dl4m8Z8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZEmFZF d dlGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ d dlRmSZS d dlTmUZUmVZV d dlWmXZXmYZY d dlZm[Z[m\Z\ d dl]m^Z^m_Z_ d dl`maZambZbmcZcmdZd d dlemfZf d dlgmhZhmiZimjZj d dlkmlZl d dlmm?Zn d dlompZpmqZq d dlrmsZsmtZtmuZu d dlvmwZw d d!lxmyZy d d"lzm{Z{ d d#l|m}Z} d d$l~mZ d d%lmZ  e?j                         rd d&lmZ d d'lmZmZmZmZ ndd(Zdd)Zdd*Zdd+Ze/rPd d,lmZmZmZ d d-lmZ d d.lmZmZ d d/lmZ d d0lmZ d d1lmZmZ d d2lzmZmZ d d3lmZmZ d d4lmZ d d5lmZ  e0d6      ZejH                  d7k(  Zd8Ze4jN                  jQ                  ed9      Z ejV                  e      Zdd:Z ej\                  d      dd;       Z G d< d=      Z G d> d?e      Z G d@ dAe      ZddBZddCZdddDZ	 d	 	 	 	 	 	 	 ddEZ	 d	 	 	 	 	 	 	 ddFZ	 	 	 d	 	 	 	 	 	 	 	 	 	 	 ddGZddHZ	 	 d	 	 	 	 	 	 	 	 	 ddIZejv                   G dJ dK             ZddLZddMZ G dN dOej~                        Z	 	 	 	 	 	 	 	 ddPZ ej\                  d      ddQ       ZddRZejv                   G dS dT             Z G dU dVeū      Z G dW dX      Z	 	 	 	 	 	 	 	 	 	 ddYZddZZ G d[ d\      Z ej\                  d      dd]       Zeb G d^ d_             Z G d` da      Zebej\                  ddb              ZddcZdaddede<   ddfZeb G dg dh             Z	 	 	 	 	 	 ddiZeb G dj dkeӫ             Zeb G dl dmeի             Zeb G dn doeի             ZddpZddqZeb G dr ds             Z	 	 	 	 	 	 ddtZdduZddvZddwZddxZddyZ	 d	 	 	 	 	 	 	 	 	 ddzZ G d{ d|      Zeb G d} d~             Zeb G d d             Z G d d      Z G d de      Zy)    )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)partial)Path)timetime_ns)
ModuleType)AnyCallablecastNoReturnOptionalTYPE_CHECKINGTypeVarUnion)Self)SymIntTensor)CompileEventLoggercountersdynamo_timed)configexcmetrics)cuda_env)rocm_compile_commandrocm_compiler)
_LINKER_SCRIPT_set_gpu_runtime_env_TORCH_PATH_transform_cuda_paths
CppBuilder
CppOptionsCppTorchDeviceOptionsget_compiler_version_info&get_name_and_dir_from_output_file_pathnormalize_path_separator)pick_vec_isa)CustomGraphPassCustomGraphPassType)has_frozen_paramsis_frozen_param)_reload_python_module _reload_python_module_in_subproc)	cache_dirdefault_cache_dir)ALIGN_BYTESclear_on_fresh_inductor_cacheis_linux
is_windows)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)log_cache_bypass)r   )CacheArtifactManagerCacheArtifactType)has_hinthint_intShapeEnv)
OrderedSet   )CUSTOM_OBJ_FILENAME_PREFIX)create_cache)autotune_cache)AutotuneCacheBundler)TritonBundler)build_paths)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cachec                      y N argskwargss     I/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/codecache.pyrL   rL   p           c                      y rQ   rR   rS   s     rV   rM   rM   s   rW   rX   c                      y rQ   rR   rS   s     rV   rN   rN   v   rW   rX   c                      yNFrR   rR   rX   rV   rO   rO   y   s    rX   )	GeneratorKeysViewSequence)Future)_CompileFxKwargsCompiledFxGraphGraphLowering)ChoiceCaller)CompiledFxGraphConstants
OutputCode)
JsonDataTyRemoteCache)HalideInputSpec
HalideMeta)CachingAutotuner)	InputTypeTwin32iX  output_codec                 >    t         j                  j                  dS dS )N
cubin_path
hsaco_path)torchversionhiprR   rX   rV   get_cpp_wrapper_cubin_path_namerw      s     ==,,4<F,FrX   c                    | >t        t        j                  j                  | t        j                         d               S d S )Nhash)r   ospathjoin	CacheBase
get_system)global_cache_dirs    rV   get_global_cache_path_implr      sA     ' 	RWW\\*I,@,@,B6,JKL rX   c                      e Zd Ze ej
                  d      dd              Zee ej
                  d      d	d                     Zed
d       Z	ddZ
ddZddZy)r}   Nc                    	 ddl m}   |        }	 dd id|id}t        j                  j                  t        j                  j                               }t        j                  j                  3|j                  |d   d<   t        j                  j                  |d   d<   n2|j                  |d   d<   t        j                  j                  |d   d	<   t        j                  t        j                   |d
      j#                  d            j%                         |d<   |S # t        $ r d }Y w xY w# t        t        f$ r i }Y qw xY w)Nr   )
triton_keynametriton)deviceru   r   ru   cudarv   T)	sort_keysutf-8ry   )triton.compiler.compilerr   ModuleNotFoundErrorrt   r   get_device_propertiescurrent_deviceru   r   gcnArchNamerv   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   triton_versionsystemdevice_propertiess       rV   r~   zCacheBase.get_system   s3   	"; (\N	!4.n&F !&

 @ @

))+! }}!!-+<+A+Ax (,1MM,>,>y!&)+<+H+Hx (+0==+<+<y!%(
 !JJv.55g>

)+ 	v 7 # 	"!N	"& - 	F	s#   D CD1 D.-D.1EEc                     t        t        j                  j                  t	               dt
        j                         d               S )Ncachery   )r   rz   r{   r|   r4   r}   r~   rR   rX   rV   get_local_cache_pathzCacheBase.get_local_cache_path   s0     BGGLLgy7K7K7Mf7UVWWrX   c                 4    t        t        j                        S rQ   )r   r   r   rR   rX   rV   get_global_cache_pathzCacheBase.get_global_cache_path   s    )&*A*ABBrX   c                6    t         j                         | _        y rQ   )r}   r~   r   selfs    rV   __init__zCacheBase.__init__   s    **,rX   c                    | j                         }|j                         si S t        |      5 }t        j                  |      }d d d        |d   S # 1 sw Y   d   S xY wNr   )r   is_fileopenr   load)r   local_cache_pathlocal_cache_fplocal_caches       rV   get_local_cachezCacheBase.get_local_cache   sa    446'')I"# 	4~))N3K	47##	47##s   AAc                    | j                         }t        t        |      t        j                  | j
                  |dd      d       y )N)r   r      )indentT	make_dirs)r   write_atomicstrr   r   r   )r   r   r   s      rV   update_local_cachezCacheBase.update_local_cache   s<    446 !JJ$++DQO	
rX   returndict[str, Any])r   r   )r   Optional[Path]r   None)r   r   r   r   )__name__
__module____qualname__staticmethod	functools	lru_cacher~   r7   r   r   r   r   r   rR   rX   rV   r}   r}      s    Y"  "H "YX  # X C C-$
rX   r}   c                      e Zd ZddZddZy)
LocalCachec                N    | j                         }|}|D ]  }||v r||   } y  |S rQ   )r   )r   keysr   	sub_cachekeys        rV   lookupzLocalCache.lookup   s?    $$&	 	Ce|!#J			 rX   c                   | j                         }|}|dd D ]  }|j                  |i        ||   } |||d   <   | j                  |       y )Nr   )r   
setdefaultr   )r   valuer   r   r   r   s         rV   	set_valuezLocalCache.set_value   sa    $$&	": 	'C  b)!#I	' $	$r(&rX   N)r   r   r   Optional[dict[str, Any]])r   r   r   r   r   r   )r   r   r   r   r   rR   rX   rV   r   r      s    
	'rX   r   c                  Z    e Zd Z ej                  d      dd       Z	 	 	 	 	 	 	 	 	 	 ddZy)PersistentCacheNc                    | j                         }||j                         si S t        |      5 }t        j                  |      }d d d        |d   S # 1 sw Y   d   S xY wr   )r   r   r   r   r   )r   global_cache_pathglobal_cache_fpglobal_caches       rV   get_global_cachez PersistentCache.get_global_cache   sg     668$,=,E,E,GI#$ 	699_5L	6G$$	6G$$s   AA c                   t        j                         t        t        | j                        }t        t
        | j                        }t        t        | j                        }i ddfd}t        j                  st        j                  rt        j                  r| j                         ni }	 ||	      st               r || j                         |      s|	  |      t        fdD              sJ |	j                  i        |	   j                  i       j                  i        j!                         D ]!  \  }
}||	         |
j#                         <   # 	 | j'                  |	       D 
ci c]  }
|
j#                         |
    }}
 ||       S t               r || j                         |       S # t$        $ r} ||       |d}~ww xY wc c}
w )aG  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nc                    d}D ][  }|j                         }|| j                  i       j                  i       j                  i       v r|          |   	|<   Yd} n |r	 ||       |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)
r   callbackhitchoicechoice_hashchoicesinputsop	precisiontimingss
        rV   check_cachez+PersistentCache.lookup.<locals>.check_cache$  s    C! $oo/%))B"3"7"7"C"G"G	SU"VV&+Bi&7	&B;&OGFO  C $JrX   )r   c              3  &   K   | ]  }|v  
 y wrQ   rR   ).0r   r   s     rV   	<genexpr>z)PersistentCache.lookup.<locals>.<genexpr>B  s     GVv0Gs   rQ   )r   r   r   r   r   bool)rt   get_float32_matmul_precisionr
   rM   r   rN   rL   r   max_autotunemax_autotune_gemmautotune_local_cacher   rO   r   allr   itemsr   r   r   )r   r   r   r   	benchmark	log_statslog_vals
log_errorsr   r   r   timingetimings_to_logr   r   s    ```          @@rV   r   zPersistentCache.lookup	  s   $ 668	2DKKVYW	0$++r69U#T[["fi

 	 	  &":":4:4O4O$..0UWK  ,$&#D$9$9$;iP)'0GGwGGGG**2r2O..vr:EEiQST*1--/ WPVB/	:6??;LMW ''4 FM";AFOO%wv6" " (  --/)D ! $ qMG"s   .B	G G3	G0!
G++G0r   )
r   zlist[ChoiceCaller]r   r   r   r   r   z4Optional[Callable[[Any], dict[ChoiceCaller, float]]]r   zdict[ChoiceCaller, float])r   r   r   r   r   r   r   rR   rX   rV   r   r      s]    Y% %N#N N 	N
 HN 
#NrX   r   c                     t         j                  j                  t               d      } t         j                  j	                  |       st        j
                  | d       | S )NlocksTexist_ok)rz   r{   r|   r4   existsmakedirs)lock_dirs    rV   get_lock_dirr   Z  s;    ww||IK1H77>>(#
Ht,OrX   c                    t        j                  t        j                  |       j	                               d d j                  d      j                         S )N3   r   )base64	b32encoder   r   digestdecodelower)datas    rV   sha256_hashr  a  s@    GNN40779:3B?FFwOUUWWrX   c                    t        | t              r| n| j                  d      }|r+t        |t              r|n|j                  d      }|dz   |z   }dt        |      z   S )Nr   s   ||c)
isinstancebytesr   r  )codeextrahashing_strextra_bs       rV   	code_hashr  f  sV    $T51$t{{77KK%eU3%g9N!E)G3[)))rX   c                F   |rKt         j                  j                  |      r|}nTt         j                  j                  t	               |      }n+t         j                  j                  t	               | dd       }t         j                  j                  ||  d|       }| ||fS )NrE      .)rz   r{   isabsr|   r4   )basename	extensionspecified_dirsubdirr{   s        rV   get_pathr  n  sz     77=='"FWW\\)+}=Fik8Aa=977<<8*Ai[ 9:DVT!!rX   c                p    |dk(  rt        | |      S |dv rt        t        |             S t        d|       )Nr	  )cubinhsacospvzUnknown hash type )r  reprr   )contentr
  	hash_types      rV   get_hashr  |  sD     F%((--g''
-i[9
::rX   c                    t        | j                         ||      }t        |||      \  }}}t        j                  j                  |      st        || d       ||fS )NTr   )r  stripr  rz   r{   r   r   )	r  r  r
  r  r  r   r  _subdirr{   s	            rV   writer"    sS     	:C&sI}EHgt77>>$T7d3T>rX   c                     t        | d      d   S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtrE   r"  )texts    rV   
write_textr'    s     ua  rX   c                R   t        |t        t        f      sJ d       t        |       }|r|j                  j                  dd       |j                  dt        j                          dt        j                          dz  }t        |t              rdnd}|j                  ||rdnd 	      5 }|j                  |       d d d        	 |j                  |
       y # 1 sw Y   xY w# t        $ r6 t        s t        j                   ||       t        j"                  |       Y y w xY w)Nz6Only strings and byte arrays can be saved in the cacheT)parentsr   r  z.tmpwwbr   )encoding)target)srcdst)r  r   r  r   parentmkdirrz   getpid	threading	get_identr   r"  renameFileExistsError_IS_WINDOWSshutilcopy2remove)path_r  r   encode_utf_8r{   tmp_path
write_modefs           rV   r   r     s    gU|, @, ;D$6{{qQy/B/B/D.ETJJH"7C0dJ	z|G	N RS	
t$    	t,
		(s   -CC' C$'<D&%D&c                  &    e Zd ZU dZded<   ded<   y)TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    r=   tensor_metadata	list[Any]valuesNr   r   r   __doc____annotations__rR   rX   rV   rA  rA    s    
 $#rX   rA  c                    | S rQ   rR   xs    rV   _identrK    s    HrX   c                d    t        |       }t        | d      st        j                  |dd      }|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    _is_inductor_staticr   N)storage_offsetstorage_bytes)r;   hasattrdataclassesreplace)tmetas     rV   %extract_tensor_metadata_for_cache_keyrU    s2    
 #1%D1+,""4NKrX   c                       e Zd ZdZ	 d	 	 	 	 	 d fdZ	 	 	 	 ddZ	 	 	 	 ddZddZddZ	 	 	 	 ddZ	ddZ
dd	Zdd
Z xZS )FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    c                v   t        j                         | _        t        |   | j                         t
        j                  j                         | _        | j                  j                  t        t        j                  | j                        t        j                  t        j                  | j                        t        j                   j"                  j$                  t        j                  | j                        t        j&                  t        j                  | j(                        t        j*                  j,                  j.                  j0                  t        j                  | j2                        i       |r6t        j                  | j4                        | j                  |j6                  <   d| _        y)a2  
        Create an FX graph pickler. If include_non_inlined=True, then pickling will
        include the _values_ for all Tensors. (Note that any tensors are constants
        attached as attributes to the GraphModule). Otherwise, pickling will include
        only the metadata for these tensors.
        TN)ioBytesIO_streamsuperr   copyregdispatch_tabler   updater<   r   r
   _reduce_fake_tensorrt   r   _reduce_tensornn	parameter	Parameterr   _reduce_symintfxexperimental_backward_stateBackwardState_reduce_unsupported_reduce_graph_module	__class__fast)r   gmhas_user_defined_triton_kernelsrl  s      rV   r   zFxGraphCachePickler.__init__  s)    zz|&%4499;""I--d.F.FGi//0C0CD"",,i.?.?@S@S.Ti//0C0CD%%55CCYEVEV,,F
	
 +090A0A))1D- 	rX   c                *    t        |      }t        |ffS )z7
        Custom reducer to pickle FakeTensors.
        )rU  rK  )r   rS  metadatas      rV   r`  z'FxGraphCachePickler._reduce_fake_tensor  s     9;$$rX   c                P   ddl m} |j                  rt        d      t	        |      }t        |      r |j                  |      s	t        |ffS t               }|j                         }t               |z
  }|dkD  rt        j                  d|dd       t        t        ||      ffS )z
        Custom reducer to pickle Tensors.  If we see tensors, we know they're constants
        stored as attributes on the GraphModule.
        rE   rc   zmkldnn tensors unpickleableg      ?z0FX graph cache copying of a large constant took z.1zs. Please file an issue.)graphrd   	is_mkldnnBypassFxGraphCacherU  r1   can_inline_constantrK  r   tolistwarningswarnrA  )r   rS  rd   rq  startrD  elapseds          rV   ra  z"FxGraphCachePickler._reduce_tensor  s     	);; %%BCC8; 1&Gm&G&G&JXK(( &5.S=MMB72, O( (
 06BDEErX   c                &    t         t        |      ffS )z3
        Custom reducer to pickle SymInts.
        )rK  r   r   ss     rV   re  z"FxGraphCachePickler._reduce_symint/  s     Q	""rX   c                    t        d      )z{
        Custom reducer to handle any objects that we don't support and therefore
        raise to bypass caching.
        zReduce unsupported)ru  r}  s     rV   rj  z'FxGraphCachePickler._reduce_unsupported8  s    
 !!566rX   c                    |j                         \  }\  }}|d   }t        j                  dd|      }t        j                  dd|      }||d<   |||ffS )a  
        Custom reducer for graph module to handle irrelevant data for user
        defined triton kernels
        Essentially what we are doing here is a huge hack where user defined
        triton kernel contain a dynamo time side table and the arguments to the
        call_function are indicies into this side table. These arguments are not
        for hashing purposes since we included the source code into the cache
        key and the numbers are prone to give false negatives due to ordering.
        _codezkernel_idx = \d+ zconstant_args_idx = \d+)
__reduce__resub)r   rn  fnr  importsr	  s         rV   rk  z(FxGraphCachePickler._reduce_graph_module?  s_     !mmoOT7G}vv)2t4vv0"d;WD'?""rX   c                   	 | j                  |       | j                  j                         | j                  j                  d       | j                  j	                  d       S # t
        t        f$ r(}t        j                  dd       t        d      |d}~ww xY w# | j                  j                  d       | j                  j	                  d       w xY w)z<
        Pickle an object and return a byte string.
        r   zFailed to pickle cache keyTexc_infoN)
dumpr[  getvalueseektruncate	TypeErrorAttributeErrorlogwarningru  )r   objr   s      rV   r   zFxGraphCachePickler.dumpsR  s    
	%IIcN<<((* LLa LL!!!$ >* 	JKK4tKD$%ABI	J LLa LL!!!$s#   *A# #B2#BBB 8Cc                :    | j                  |      }t        |      S )zE
        Serialize an object and return a hash of the bytes.
        )r   r  )r   r  serialized_datas      rV   r  zFxGraphCachePickler.get_hashb  s     **S/?++rX   c                >    d fd}g }t        |      j                         D ]  \  }}t        |t              rTt	        t        |            D ]<  } j                  ||         }|j                  d| d| d| d |||                 > jt        |t              rM|j                         D ]9  \  }}	 j                  |	      }|j                  d| d| d| d ||	              ; ǉ j                  |      }|j                  d| d| d ||               |S )z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        c                   t        | t        j                        rt        t	        |             S t        | t
              ryt        |       j                  v r*t         j                  t        |          |       d         S t        |       S )Nz<bytes>rE   )r  rt   r   r   rU  r  typer^  )r  r   s    rV   get_strz0FxGraphCachePickler.debug_lines.<locals>.get_strp  sq    #u||,@EFFC' cd11194..tCy9#>qABB3xrX   [z] z]: z: r  r   r   r   )	varsr   r  listrangelenr  appenddict)
r   inpr  linesattrr  iihkvs
   `         rV   debug_lineszFxGraphCachePickler.debug_linesi  s)   		  c* 	>ID##t$C/ LBc"g.ALL1QCr$qCB8H7I!JKL C&IIK EDAqa(ALL1QCr$q3wqzl!CDE MM#&q2dV2gcl^<=	> rX   F)rn  torch.fx.GraphModulero  r   r   r   )rS  r   r   z.tuple[Callable[[T], T], tuple[TensorMetadata]])rS  r   r   zNtuple[Callable[[T], T], tuple[Union[TensorMetadata, TensorMetadataAndValues]]])r~  r   r   z#tuple[Callable[[T], T], tuple[str]])r~  r   r   r   )rn  r  r   z&tuple[Any, tuple[dict[str, Any], str]])r  r   r   r  r  )r  FxGraphHashDetailsr   	list[str])r   r   r   rF  r   r`  ra  re  rj  rk  r   r  r  __classcell__)rl  s   @rV   rW  rW    s     16" " *." 
	"H%%	7% F F	W FD#7#&#	/#&% ,rX   rW  c                   t        t        j                  | |      d       D ]  }|j                  j	                  |j
                  d       }|J |j                  }|J t        |d      5 }|j                  |j
                  j                  d             |j                  |j                                d d d        |j                  st        |j                  |j
                   d|        y # 1 sw Y   =xY w)Nc                    | j                   S rQ   )r   rI  s    rV   <lambda>z!build_code_hash.<locals>.<lambda>  s
     rX   r   rbr   r  )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   r_  r   readispkgbuild_code_hashsubmodule_search_locations)rootsprefixhasherlibspecmoduler?  s          rV   r  r    s     g**5&9?OP 
V  **388T:!!!&$ 	$1MM$))**734MM!&&(#	$ 99D;;		{!_fU
V
	$ 	$s   ,A
C22C;	c                 
   t        dd      5  t        j                         sd
d}  | t              cddd       S ddlm} |j                  d      j                         j                  d	      cddd       S # 1 sw Y   yxY w)zS
    Compute a key that contains relevant information about torch source files
    inductor_codecache_torch_keyT)log_pt2_compile_eventc                >   d}t         j                  j                  t              }|D cg c]"  }t         j                  j	                  ||      $ }}t        j                         }|j                  t        j                  j                  d             t        | gd|       |D ]V  }t         j                  j                  |      s#t        |d      5 }|j                  |j                                d d d        X |j                         S c c}w # 1 sw Y   wxY w)N)z"codegen/aoti_runtime/interface.cppcodegen/cpp_prefix.hz	script.ldr   r  r  )rz   r{   dirname__file__r|   r   r   r_  rt   __version__r   r  r   r   r  r   )rootextra_filesinductor_rootrJ  r  r{   r?  s          rV   get_code_hashz torch_key.<locals>.get_code_hash  s    
 !# 9GRS!rww||M1=SS )e//66w?@F3' 4Dww~~d+!$- 4"MM!&&(34 44 }}& T4 4s   'D DD	Nr   parutilztorch/src_hash.txtascii)r  r   r   r  )	r   r   	is_fbcoder%   libfb.pyr  get_file_contentsrstripr   )r  r  s     rV   	torch_keyr    sw    
 
4D	Q X!'( !-/X X2 	%(()=>EEGNNwW7X X Xs   $A93A99Bc                 H    t         j                  j                  t              S rQ   )rz   r{   r  r  rR   rX   rV   get_inductor_rootr    s    77??8$$rX   c                      e Zd ZU dZded<   y)OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    rC  r   NrE  rR   rX   rV   r  r    s    
 rX   r  c                      e Zd ZdZy)ru  zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   rF  rR   rX   rV   ru  ru    s    rX   ru  c                  B    e Zd ZdZdgZ	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 ddZy)r  zz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    graph_idc                   || _         || _        t        j                  | _        i | _        t        |j                               D ]\  \  }}|| j                  vst        |      t        t        fv r"t        t        |            | j                  |<   N|| j                  |<   ^ ddlm}m}m}	 ddlm}
 g | _        |@|j'                         D ],  }t)        |t*        j,                  j.                        s)t1        j2                  |j4                  j7                  d|      |j4                  j7                  d|	            D ]  }ddlm} |j=                  |j>                  d         }d }t)        ||      r=|j@                  r%tC        t        d |j@                  D                    }|jD                  } |
|      }|jG                  |j>                  d	         }| j$                  jI                  |||f        / || _%        tM        d
 |D               }|rAt*        jN                  jQ                         r#t*        jN                  jS                         | _*        t+        jV                         t+        jX                         t*        jZ                  j\                  j^                  f| _0        t*        jb                  jd                  jf                  jh                  t*        jb                  jd                  jf                  jj                  t*        jb                  jd                  jf                  jl                  f| _7        tq               | _9        tt        jw                         | _<        t{        j|                         | _?        | j                  tz        j                        | _A        | j                  tz        j                        | _B        y )Nr   )kernel_side_table triton_kernel_wrapper_functionaltriton_kernel_wrapper_mutation)9user_defined_triton_kernel_transitive_closure_source_codecall_function)r   r-  )	Autotuner
kernel_idxc              3  x   K   | ]2  }t        d  |j                         j                         D               4 yw)c              3  2   K   | ]  }t        |        y wrQ   )r   )r   kvs     rV   r   z8FxGraphHashDetails.__init__.<locals>.<genexpr>.<genexpr>  s     *Tr3r7*Ts   N)r  
all_kwargsr   )r   r  s     rV   r   z.FxGraphHashDetails.__init__.<locals>.<genexpr>  s2      '"() %+*TQ\\^=Q=Q=S*T$T'"s   8:constant_args_idxc              3  P   K   | ]  }t        |t        j                           y wrQ   )r  rt   r   )r   rJ  s     rV   r   z.FxGraphHashDetails.__init__.<locals>.<genexpr>(  s     "W1:a#>"Ws   $&)Crn  example_inputscconfigcache_key_tag	fx_kwargsr  r   EXCLUDED_KWARGSr  setrD   r  *torch._higher_order_ops.triton_kernel_wrapr  r  r  torch._inductor.codegen.wrapperr  user_defined_triton_sourcemodulesr  rt   rf  GraphModule	itertoolschainrs  
find_nodestriton.runtime.autotunerr  
get_kernelrU   configsr   r  get_constant_argsr  inputs_to_checkanyacceleratoris_availablecurrent_device_indexdefault_cuda_device_index$are_deterministic_algorithms_enabled-is_deterministic_algorithms_warn_only_enabledutilsdeterministicfill_uninitialized_memory!deterministic_algorithms_settingsbackendsr   matmul
allow_tf32&allow_fp16_reduced_precision_reduction&allow_bf16_reduced_precision_reductioncuda_matmul_settingsr  torch_versionr}   r~   system_infor   save_config_portableinductor_config_get_custom_pass_detailpost_grad_custom_pre_passpost_grad_custom_post_pass)r   rn  r  r  r  r  r  r  r  r  r  r  noder  kernelr  kernel_sourceconstant_argsno_tensor_inputss                      rV   r   zFxGraphHashDetails.__init__  s$    ,$22
 -/9??,- 	*DAq,,,7sJ// )9(CDNN1%()DNN1%	*	
 	

	
 68'>**, #!&%((*>*>?%OOLL++*3S ,  LL++*3Q , 	  D C.99$++l:STF"G!&)4!>>&) & '"-3^^'" !"'G "( R" "
 %6$G$G$78%M 33::&w?= #L  /""W"WWW  1 1 > > @-2->->-S-S-UD* 668??AKK%%??2
. NN&&11NN&&MMNN&&MM%
! '[$//1%::<)-)E)E,,*
& +/*F*F--+
'rX   c                L    |sy t        |t              sJ |j                         S rQ   )r  r.   uuid)r   custom_passs     rV   r  z*FxGraphHashDetails._get_custom_pass_detailI  s)     +777!!rX   N)
rn  r  r  Sequence[InputType]r  ra   r  Sequence[int]r   r   )r  r/   r   zOptional[Any])r   r   r   rF  r  r   r  rR   rX   rV   r  r    sa     "lOl
 l
 ,l
 $	l

 'l
 
l
\"."	"rX   r  c                   t        | |||      }t        |j                        dk7  }t        | |      }d|j	                  |      z   }|j                  |      }dj                  |      }	t        j                  d| d|	        ||fS )z=
    Generate a unique hash of the FX graph for caching.
    r   r?  
z$FX graph cache hash details for key z:
)	r  r  r  rW  r  r  r|   r  debug)
rn  r  r  r  detailsro  picklerr   r  	debug_strs
             rV   compiled_fx_graph_hashr(  R  s     !^YPG&)'*L*L&MQR&R#!"&EFG   )
)C%%g.K		+&III4SEYKHIrX   c                   t         j                  j                         rt         j                  j                         syt	        | dz        }t        j                         rJt         j                  j                  d      }t        j                  d||       |t	        ||z  dz        z  }t        j                  d|       t        j                  j                  t        |             |S )z}
    Ephemerally increases the NCCL timeout when compiling for a distributed job
    Returns amount of seconds increased
    r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)rt   distributedr  is_initializedintr   r  _utils_internaljustknobs_getval_intr  infodistdistributed_c10d"_add_ephemeral_timeout_for_all_pgsr	   )time_saved_nsincreased_timeout_secfudge_factors      rV   .add_ephemeral_timeout_increase_for_distributedr8  h  s    
 ))+53D3D3S3S3U 45,,AAL
 	\!	

 	%:\%IC%O!PPHH,.CD<</0 ! rX   c                  b   e Zd ZdZedd       Zedd       Zedd       Zedd       Ze	 	 	 	 	 	 	 	 	 	 	 	 dd       Z	edd       Z
e	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd	       Zedd
       Ze	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd       Zy)FxGraphCachea7  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metatdata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    c                 R    t         j                  j                  t               d      S )zS
        Get the toplevel temporary directory for storing compiled graphs.
        fxgraph)rz   r{   r|   r4   rR   rX   rV   _get_tmp_dirzFxGraphCache._get_tmp_dir  s    
 ww||IK33rX   c                n    t         j                  j                  t        j	                         | dd |       S )zA
        Return the disk location for a given cache key.
        rE   r  )rz   r{   r|   r:  r=  r  s    rV   _get_tmp_dir_for_keyz!FxGraphCache._get_tmp_dir_for_key  s*    
 ww||L557Qq3GGrX   c                z    | D cg c]+  }t        |t        j                        st        |      s*|- c}S c c}w )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        )r  rt   r   rA   )r   r~  s     rV   _filter_backed_symintsz#FxGraphCache._filter_backed_symints  s+     "QaZ5<<%@Xa[QQQs   888c                     t         j                  j                  j                         } | sy| j                  j
                  S )zG
        Helper to get the shape env from the tracing context.
        N)rt   _guardsTracingContexttry_get	fake_mode	shape_env)ctxs    rV   _get_shape_envzFxGraphCache._get_shape_env  s2    
 mm**224}}&&&rX   c                    t         j                         }|J t         j                  |      }|D cg c]  }t        |       }}d fd}	d}
d}t	               } |	       D ]c  \  }}|j
                  s|}
 nPt        |j                  |j
                  |            }t        j                  d |j
                  ||       |sa|}
 n |
d|fS |%t        j                  t        j                   |       |
j                  x}rut        j                   |      }|x}\t#        |      |d<   t%        j&                  d|j(                         t+        |j(                        dkD  rt%        j,                  d       	 |
j/                  |      d	d
lm} |j4                   |j4                  |
j6                         t;        j<                         }|
j6                  t?        j@                  |       |
j
                  rLt        |j                  |
j
                  |            }|du sJ t        j                  d |jB                         tD        jF                  jI                  |
jJ                         tL        dxx   |
jN                  z  cc<   tP        j                  d       tP        j                  d       tS        dfdfd       |
|fS c c}w # t8        $ r d|fcY S w xY w)z
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        Nc               3    K   rt         j                        } t        j                  j	                  |       rt        t        j                  |             D ]_  }	 t        t        j                  j                  | |      d      5 }|j                         }t        j                  |      |f d d d        a rs	 j                        x}^t!        |t"              sJ |d   }t!        |t$        t&        f      sJ t)        j*                  |      }t        j                  |      |f y y y # 1 sw Y   xY w# t        $ r t        j                  dd       Y w xY w# t        $ r t        j                  dd       Y y w xY ww)Nr  z,fx graph cache unable to load compiled graphTr  r  )r:  r?  rz   r{   r   r  listdirr   r|   r  pickleloads	Exceptionr  r  r   r  r  r   r  r   	b64decode)	r  r{   r?  r  
cache_datar  r   localremote_caches	         rV   iterate_over_candidatesz;FxGraphCache._lookup_graph.<locals>.iterate_over_candidates  sl     %::3?77>>&) &rzz&'9 : 	!%bggll64&@$!G E1*+&&(&,ll7&;W&D DE	 
&2&6&6s&;;
H)*d;;;)&1)$e==="("2"24"8$ll73W<< I E E  ) KK N)- (  ! KKFQU   sg   AF	*D<*D00D<8F	=A0E# -F	0D9	5D<< E F	E  F	# FF	FF	zEfx graph cache key %s evaluating guards [%s] with values %s => hit=%striton_bundler_metainductor_compile)cached_kernel_namesr   num_triton_bundlesrE   rc   r	  Tz*fx graph cache key %s post-load guards: %sinductorzOutput code: 
%szOutput code written to: %sinductor_output_codec                     d iS )NfilenamerR   )artifact_paths   rV   r  z,FxGraphCache._lookup_graph.<locals>.<lambda>J  s    Z/ rX   c                      S rQ   rR   rY  s   rV   r  z,FxGraphCache._lookup_graph.<locals>.<lambda>K  s    t rX   
payload_fn)r   z4Generator[tuple[CompiledFxGraph, bytes], None, None])*r:  rI  rA  rB   r  guards_exprr   evaluate_guards_expressionr  r$  r?   record_artifactr@   INDUCTOR_triton_bundlerJ   read_and_emitr   r   try_add_pt2_compilerW  r  increment_toplevelafter_deserializationrs  rd   save_output_codesource_codeOSErrorrH   inductor_meta_from_configrI   begin_compileguardsr   CachedMetricsHelperapply_deltasmetrics_deltasr   counter_deltasoutput_code_logr:   )r   r  rR  rS  	constantsrG  symintsr~  hintsrT  rs  pickled_content
cache_info	candidater   bundlerU  rT  rd   inductor_metacheckr^  r	  s   ` ``                 @@rV   _lookup_graphzFxGraphCache._lookup_graph  s    !//1	$$$55nE&-.!..	@ %)V
*A*C 	&I((! 44Y5J5JERC IIW%% !-	0 =##& 00!**C )))6)"/"="=f"E++847I
01"66&D<T<T t//014&99:NO	$!77	BM, --9...u/@/@A '@@B  **=tD 44U5F5FPE D= =II<c9CSCS 	##001E1EF 4 44148:MJ"/#	

 j  w /t  	$ ##	$s   K??K K*)K*c                   t         j                  |       }t        j                  j	                  |      st        j
                  |d       t        j                  j                  |t        |            }t        ||d       y )NTr   r   )	r:  r?  rz   r{   r   r   r|   r  r   )r   r  r  r{   s       rV   _write_to_local_cachez"FxGraphCache._write_to_local_cacheO  sW    2237ww~~f%KK.
 ww||FK$89T7d3rX   c                r   ddl m} t        ||      sJ dt        |       d       t	        |      }|j                          t        j                         }|J t        j                  |      }|j                  |      }	|j                  ||	      |_        	 t        j                  |      }
	 t%        j&                  t(        j*                  | |
       |rt        j-                  | |
       |rVt/        |j0                  xs ddz        }t3        j4                  |
      j7                  d      |d}|j9                  | |       yy# t        $ r. t        j!                  dd	       t"        d
   dxx   dz  cc<   Y yw xY w# t        $ r. t        j!                  dd	       t"        d
   dxx   dz  cc<   Y yw xY w)z=
        Store a serialized CompiledFxGraph on disk.
        rE   )rb   zserialization for z NYIN)placeholdersrp  z1fx graph cache unable to serialize compiled graphTr  rZ  fxgraph_cache_pickle_errorr   g    .Ar  )r  time_taken_msz!fx graph unable to write to cachefxgraph_cache_write_error)
compile_fxrb   r  r  r   prepare_for_serializationr:  rI  rA  get_pruned_guardsproduce_guards_expressionrb  rM  r   rO  r  r  r   r?   rd  r@   re  r  r.  _time_taken_nsr   	b64encoder  put)r   compiled_graphr  rR  rS  rb   disk_compiled_graphrG  rw  rp  r  r  rQ  s                rV   _save_graphzFxGraphCache._save_graph[  s    	0./: 	
 n!5 6d;	
: #>2557 !//1	$$$55nE,,W5*3*M*M  +N +
'	ll#67G	C 00!**C 223@ #%8%G%G%L1QT$T U",,W5<<WE%2*
   j1   	KKCd   Z !=>!C>	*  	CKK;dKKZ !<=B=	Cs%   E /BE? 4E<;E<?4F65F6c                4   | j                         D ]  }t        |t        j                  j                        s)|j
                  j                  D ]  }t        |j                  t        j                  j                        r@|j                  j                         s&t        d|j                  j                                |j                  dk(  st        t        | |j                        t        j                  j                         st        d        y )Nz!Can't cache HigherOrderOperator: getattrzCan't cache torchbind objects)r  r  rt   rf  r  rs  nodesr-  _opsHigherOrderOperator	cacheableru  r   r   r  _CScriptObject)rn  r  r  s      rV   _check_for_hopzFxGraphCache._check_for_hop  s    jjl 	NFfehh&:&:;** Nt{{EJJ,J,JK KK113,;DKK<L<L<N;OP  77i'JB,ehh.C.C- --LMMN	NrX   c                J   t         j                  t         j                  fD ]0  }|st        |t              r|j                         r't        d       t        |       r*t        j                  j                  d      st        d      t         j                  j                  rt        d      ddlm} |j                  rt         j#                  d       t        t$        j'                          t         j#                  d	       t        d
      t$        j)                  |        y)z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        z!Unsupported post grad custom passz,pytorch/inductor:allow_freezing_with_cachingz$Skipping graph with frozen constantszORuntime constant folding can introduce constants that aren't static across runsr   )CompilerBisectorz$dont cache graph when bisect enabledNzfx graph cache no shape envzNo shape env)r   r  r  r  r.   r  ru  r0   rt   r/  justknobs_checkaot_inductoruse_runtime_constant_folding!torch._inductor.compiler_bisectorr  bisection_enabledr  r$  r:  rI  r  )rn  pr  s      rV   _check_can_cachezFxGraphCache._check_can_cache  s     22F4U4UV 	NA*Q8()LMM	N
 R )>)>)N)N:*
 %%KLL;;$% 
 	G--II<=$$ &&(0II34$^44 	##B'rX   c                @   	 t         j                  |        t        | |||      \  }}||fi fS # t        $ rf}t        d   dxx   dz  cc<   t
        j                  d|       |rt        dt        |             dt        |      t               d}d|fcY d}~S d}~ww xY w)	a  
        Checks that the inductor input is cacheable, then computes
        and returns the cache key for the input.
        Returns (key_info, cache_info) where:
        - key_info is (hash_key, debug_lines), and
        - cache_info will contain debug info in the event of BypassFxGraphCache.

        NB: It is possible to have this function return a union instead. But
        I personally believe it is more annoying/difficult to read in that format.
        rZ  fxgraph_cache_bypassrE   z%Bypassing FX Graph Cache because '%s'bypass_fx_graphbypass)cache_statecache_bypass_reasoncache_event_timeN)
r:  r  r(  ru  r   r  r1  r>   r   r   )	rn  r  r  r  remoter   r  r   rz  s	            rV   prepare_keyzFxGraphCache.prepare_key  s    $	$))"-5NI C [!2%% " 
	$Z !78A=8HH<a@ !2CF;''*1v$+IJ
 ##
	$s   &. 	BABBBc                 F    d} t        | t        j                         dd      S )zK
        Attempts to load the remote cache, returns None on error.
        zfx-graph-v1FbRemoteFxGraphCacheRemoteFxGraphCache)rG   r   r  )cache_ids    rV   get_remote_cachezFxGraphCache.get_remote_cache  s+    
 !" 	
 	
rX   c                d   t         j                  | ||||      \  }}i || |t               d}|t        j	                  d|        t
        d   dxx   dz  cc<   d|d<   |r+t        j                  d       t        j                  d	|        |j                  x}	3|	|d
<   t        j                  d|	dz         t        |	      x}
dk7  r|
|d<   ||fS |r+t        j                  d       t        j                  d|        t        j	                  d|        t
        d   dxx   dz  cc<   d|d<   ||fS )z
        Lookup the graph with the given key, and return results and metadata.
        Doesn't do any logging on its own, because AOTAutograd handles a cache miss
        differently from FXGraphCache.
        )r   
componentsr  zfx graph cache hit for key %srZ  fxgraph_cache_hitrE   r   r  "inductor_fx_remote_cache_hit_count!inductor_fx_remote_cache_hit_keysr5   distributed_ephemeral_timeout_usi  r   ephemeral_timeout_increase#inductor_fx_remote_cache_miss_count"inductor_fx_remote_cache_miss_keyszfx graph cache miss for key %sfxgraph_cache_missmiss)r:  r  r   r  r1  r   r   ri  add_to_set_toplevelr  r8  )r   r  r  rR  rS  is_backwardrv  r  rz  r5  ephemeral_increases              rV   load_with_keyzFxGraphCache.load_with_key  sw    &2%?%?i&
"


% '		

 %HH4c:Z !45:5(-J}%"558 #667 "0!>!>>K.;
?+"5568M
 FmTU& @RJ;< z)) "559 #668# HH5s;Z !56!;6(.J}%z))rX   c                 r    	 t        j                  t        j                                y# t        $ r Y yw xY w)z.
        Clear out the on-disk cache.
        N)r8  rmtreer:  r=  FileNotFoundErrorrR   rX   rV   clearzFxGraphCache.clear?  s.    
	MM,3356  		s   '* 	66Nr   r   )r   r   r   r   )r   r   r   zlist[torch.SymInt])r   zOptional[ShapeEnv])r   r   r  r   rR  r   rS  !Optional[RemoteCache[JsonDataTy]]rv  rf   r   0tuple[Optional[CompiledFxGraph], dict[str, Any]])r   r   r  r  r   r   )r   r   r  rg   r  r   rR  r   rS  r  r   r   )rn  r  r   r   )rn  r  r  r   r  ra   r  r!  r  r   r   z6tuple[Optional[tuple[str, list[str]]], dict[str, Any]])r   r  )r   r   r  r  r  r   rR  r   rS  r  r  r   rv  rf   r   r  r   )r   r   r   rF  r   r=  r?  rA  rI  r  r  r  r  r  r  r  r  r  rR   rX   rV   r:  r:    s   : 4 4 H H R R ' ' J!J!+J! J! 8	J!
 ,J! 
:J! J!X 	4 	4 8C8C"8C ,8C 	8C
 88C 
8C 8Ct N N" $( $(L "& "&+"& $"& '	"&
 "& 
@"& "&H 

 

 ;*;*;* ,;* 	;*
 8;* ;* ,;* 
:;* ;*z  rX   r:  c                    | j                  d      rt        j                  j                  |       S | j                  d      rt        j                  j                  |       S | dfS )zDReturns the path where the AOT Inductor compiled kernels are stored..soz.pt2r  )endswithrz   r{   split)r{   s    rV   split_aot_inductor_output_pathr  J  sK     }}Uww}}T""	v	ww}}T""RxrX   c                  v    e Zd ZU i Zded<    eej                        Zedd       Z	edd       Z
ed	d       Zy)
CudaKernelParamCachezdict[str, dict[str, Any]]r   c                    t        |||t        t        j                  j                        d         \  }}||t               <   || j                  |<   y )Nr   )r  r  )r"  r  r   r  output_pathrw   r   )clsr   paramsr  bin_type_r{   s          rV   r  zCudaKernelParamCache.setZ  sU    8##//	
4 59.01		#rX   c                :    | j                   j                  |d       S rQ   )r   r   )r  r   s     rV   r   zCudaKernelParamCache.geth  s    yy}}S$''rX   c                6    | j                   j                         S rQ   )r   r   )r  s    rV   get_keyszCudaKernelParamCache.get_keysl  s    yy~~rX   N)
r   r   r  zdict[str, str]r  r   r  r   r   r   )r   r   r   r   )r   zKeysView[str])r   r   r   r   rG  r   r  cache_clearclassmethodr  r   r  rR   rX   rV   r  r  U  sU    ')E$)u{{+K    ( (    rX   r  c                  :    e Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zy)AotCodeCompilerc          
     z  HIJKLM |}t         j                  dk(  rt        d      t                t	               }t        ddt        |j                              }	t        |	j                               }
t        j                         xr dk(  xr j                  Lt        t        j                  j                        \  }}t        j                  j                  rdj!                  f      d	t#        d
|
|      \  }Mt#        d|
|      \  }Jt        j                  j$                  r<|j'                  M       t        j                  j                  s|j'                  J       t(        j+                  dM       t(        j+                  dJ       t-        dMfdfd       t-        dJfdfd       t/        M      }t/        J      }|j0                  |z  KKj3                         sKj5                  d       t7        t/        K      dz        }dCKLfd}ddlm} t=               } |t>        j@                  j!                  ||dz         tB              }|5  |rkt7        |jE                  d            }tG        |d      5 }|j#                  |       ddd       t        j                  j$                  r|j'                  |       t        j                  jH                  }|d <   t7        |jK                  |jL                   d!            }t        j                  jH                  jO                         D ],  \  }}tQ        |t6              rtQ        |t6              r'J d"        tG        |d      5 }|j#                  tS        jT                  t        j                  jH                               ddd       t7        |jK                  |jL                   d!            }tW        jX                  ||       t        j                  j$                  r<|j'                  |       t        j                  j                  s|j'                  |       |rt        j                  j                  nt7        |jE                  d#            }t[        fd$j\                  j_                         D              IdDd%Ht        j                  j`                  r5d&j!                  HIfd'j\                  j_                         D              }nd&}tc        |      }t        j                          xr |d(kD  } t        j                  jd                  rd} j                  | t        j                         |d)}!t        dEdt        j                  j                   d*|!}"t        dEd+di|!}#t        t7        |jL                        Mt7        |j0                        |",      }$|$j                         }%|$jg                         }&t        t7        |jL                        Jt7        |j0                        |#,      }'|'j                         }(|'jg                         })th        jk                  d-|%       th        jk                  d.|(       t        j                  j                  r~t7        |jK                  |jL                   d/            }*|"jm                  |*       |j'                  |*       |$jo                  |       |$jq                  |M       |j'                  |       n |$js                          |'js                          | s|}+d},nxtu        tv        ty        jz                  dty        j|                  tx        j~                        j                  d0      j                               },t        j                  d1|d2z   |,      }+ ||+t         j                        }-d}.i }/t        j                  jO                               D ]  \  }.\  }0}1tQ        |1tx        j                  j                        sJ t         |. }2th        jk                  d3|0|2       |2|/|0<   tx        j                  j                  |1      }3t>        j@                  j!                  |j0                  |2      }4t        |4|3d       |j'                  |4        t>        j@                  j!                  |j0                  d4      }5tG        |5d      5 }|j#                  tS        jT                  |/             ddd       |j'                  |5       tx        j                  j                  r
t               n	t               }6|6j                  j                         D 7cg c])  }7|7j                  j                  d5      r|7j                  + }8}7d6j!                  |8      }8t        |      \  }9}:t        |j                  L7      };t        |9|8r|&|)|-|8gn|&|)|-g|:|;,      }<|<j                         }=|<jg                         }th        jk                  d8|=       tG        Md9      5 }|j#                  d       |j#                  d:|% d       |j#                  d;|= d       ddd       tG        Jd9      5 }|j#                  d       |j#                  d:|( d       |j#                  d;|= d       ddd       t        j                  j                  rPt7        |jK                  |jL                   d<            }>|;jm                  |>       |j'                  |>       |j'                  t               | rt7        |jK                  |jL                   d=            }?tG        |?d>      5 }@|@j#                  |       |@j#                  t        j                  d?|,             ddd       |j'                  |?       |j'                  |-       |j'                  |8       |<jq                  ||-       |8j                         D ]  }A|<jq                  ||A        |<j                  |       n|<js                          |&|)|-fD ]  }Bt?        j                  |B        | rddlW}C|Cj                         }Dt        d@|D      }EtG        |dA      5 }F|Fj                         }G|Fj#                  dBE|G|Ez  z
  z         |Fj#                  |       |Fj#                  t        j                  d?|,             ddd       t        j                  j$                  r|j'                  |       ddd       t        j                  j$                  r|S S # 1 sw Y   
xY w# 1 sw Y   	xY w# 1 sw Y   YxY wc c}7w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)Fz
        Returns the .so path, or returns a list of files that were generated if
        config.aot_inductor.package=True.
        ro   z.AotCodeCompiler not yet supported for inductoroi)vec_isadevice_typeaot_moder   sourcesBuildOptioncpur#  r  zwrapper.cpp)r
  r  z
kernel.cppzWrapper code written to: %szKernel code written to: %s
graph_dumpc                     dd dS )Ninductor_aot_wrapper_codecppr   r  r]  rR   )wrapper_paths   rV   r  z)AotCodeCompiler.compile.<locals>.<lambda>  s    3( rX   c                      S rQ   rR   )wrapper_codes   rV   r  z)AotCodeCompiler.compile.<locals>.<lambda>  s    | rX   r`  c                     dd dS )Ninductor_aot_kernel_coder  r  rR   )kernel_paths   rV   r  z)AotCodeCompiler.compile.<locals>.<lambda>  s    2' rX   c                      S rQ   rR   )kernel_codes   rV   r  z)AotCodeCompiler.compile.<locals>.<lambda>  s    { rX   Tr   zCMakeLists.txtc                   |dk(  rQj                   t        j                  j                               z  rt	        |       dkD  rt        d      d}nd}d}n|dk(  rd}d	}nt        d
|       t	        |       dkD  }d| d}|dt         dz  }|d| dz  }|| dz  }|s| D ]  }|d| dz  } | s |dz  }n|dz  }|dt	        |       dz
   dz  }|d| dz  }|| dz  }t        |dt                    \  }}t        |      }t        dk7  rndj                  d      }	t        t        |j                        t        |      t        |j                        |	       }
|
j!                         }|
j#                          |rt%        |d!      5 }|j'                  d"       |j)                  d      }|j+                  d#      }|d$k7  sJ |j'                  |       d"}|t	        |       k  r(|j                  | |d        }||z  }|t	        |       k  r(d d d        t-        j.                  |       |S # 1 sw Y    xY w)%Nlinux 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z.ldata, "aw"z.lrodata, "a"r  darwinz__DATA,__datar  zUnsupported platform: i   z
	.section	r#  z		.balign z	.globl	z_binary_constants_bin_start
z_binary_constants_bin_start:
z	.byte z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    z.globl	z_binary_constants_bin_end
z_binary_constants_bin_end:
S)r  xpur  T)r  r  compile_onlyuse_relative_pathr   r  
output_dirr  zr+br   s   ͫxV4r   )mutated_buffersrD   rv  r   r  
ValueErrorr   r6   r"  r   r   r)   r  r'   stemr0  get_target_file_pathbuildr   r  r  findrz   r:  )constsplatformsection_attrsymbol_prefixis_large_consts
consts_asmr  r  consts_sobject_build_optionsobject_builderconsts_or?  hdr	start_idxposrcr  rs  specified_sub_dirr  s                    rV   _compile_constsz0AotCodeCompiler.compile.<locals>._compile_consts  s   7"((:eoo6J6J6L+MM 6{]2(n  $2L#2L "X%. #"%;H:#FGG!&kD0O'~R8JJ{m266JJ}o5RSSJ]O+IJJJ" 3AHQCr"22J3 .0J<<
	#f+/):"==
H]O3NOOJ]O+GHHJ!"34KAx
 H~H#8 ,7%+?KU!"3$  ('Hx/0	N &::<H  "(E* 
"aFF1I&&,C #)L MI$?*?FF9%CF+WWVCD\2r	 F+
" IIhO
" 
"s   BH;;Ir   FileLock.locktimeoutz.jsonr*  NAOTI_DEVICE_KEYz_metadata.jsonz"Metadata must only contain stringsr  c              3  p   K   | ]-  }|j                   vrj                  |      j                   / y wrQ   )folded_constantsget_original_value_of_constantis_cuda)r   r   rs  s     rV   r   z*AotCodeCompiler.compile.<locals>.<genexpr>\  s8      u555 44T:BBs   36c                   dd}dd l }| j                         dk(  ry| j                  rSt        j                  j
                  j                  |       }t        j                  j
                  j                  |       }n>| j                         j                         }|j                         }|j                         }|j                  ||j                  |j                  |z              }t        |j                        }|r|S  ||      S )Nc                l    | j                  t        |       t        z   dz
  t        z  t        z  d      }|S )NrE       )ljustr  r6   )	raw_bytespadded_bytess     rV   _pad_to_alignmentzEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentc  s6    #,??Y+59kIKW$L ('rX   r   rX   )r!  r  r   r  )ctypesnumelrt  rt   opsmkldnndata_ptr_nbytesuntyped_storager  nbytesr   POINTERc_ubyter  contents)	rS  all_cudar#  r$  r(  r+  t_cpu	raw_arrayr!  s	            rV   	_to_bytesz*AotCodeCompiler.compile.<locals>._to_bytesb  s    ( 779>;;$yy//88;H"YY--55a8F--/335E$~~/H"\\^F"KKNN6>>F#:;	 ")"4"45	$,yN2CI2NNrX   rX   c              3  j   K   | ]*  }|j                   vr j                  |             , y wrQ   )r  r  )r   r   r2  r/  rs  s     rV   r   z*AotCodeCompiler.compile.<locals>.<genexpr>  s9      .5#9#99 eBB4H(S.s   03r  )r  r  use_mmap_weightsr  r  )r  min_optimizer  r  z#aot wrapper compilation command: %sz"aot kernel compilation command: %sz_compile_flags.json)rE   qqr  zsaving script object %s as %szcustom_objs_config.jsonz.o )r  r  r  r  zaot linkage command: %saz// Compile cmd
// z// Link cmd
// z_linker_flags.jsonz_serialized_weights.binr+  qi @  za+b    )r  r  r  r   r   r   )rS  ztorch.Tensorr/  r   r   r  rR   )Zsysr  r   r$   r-   r'   r)   r  r  get_command_liner   r  r  r  r  package_cpp_onlyr|   r"  packager  ru  r1  r:   r   r0  r   r1  r   torch.utils._filelockr  r   rz   r{   LOCK_TIMEOUTwith_suffixr   rq  	with_namer  r   r  r   r   r8  r   r   rv  r   package_constants_in_sor  force_mmap_weightsr   r  r$  save_flags_to_jsonsave_compile_cmd_to_cmakesave_src_to_cmaker  r   r.  rt   randintiinfoint64maxitemstructpack	enumeratetorchbind_constantsr  r  rF   _pickle_saver   ru   rv   ROCmCodeCacheCUDACodeCacher   rD  r  r+   r#   r  save_link_cmd_to_cmaker:  resourcegetpagesizetell)Nr  rs  r  r  serialized_extern_kernel_nodesr  additional_filesgenerated_filespicked_vec_isavec_isa_cmd_gencpp_commandspecified_output_pathspecified_artifact_namewrapper_keyr  wrapper_path_operatorkernel_path_operator
cmake_pathr  r  r   lockextern_kernel_nodes_jsonr?  rq  	meta_jsonr  r  kernel_meta_json	output_soserialized_weightsconsts_sizer4  compile_commandwrapper_build_optionskernel_build_optionswrapper_builderwrapper_compile_cmd	wrapper_okernel_builderkernel_compile_cmdkernel_ocompile_flagsaot_constantsmagic_numberr  custom_obj_idxqual_name_to_idr   constantcustom_obj_namecustom_obj_bytescustom_obj_pathconstants_config_jsongpu_codecacheentrygpu_kernels_ooutput_namer  so_build_options
so_builderlink_cmdlinker_flagsweight_file	f_weightsgpu_oo_filerU  
page_size_	page_sizef_soso_sizer2  r/  r  r  r  r  sN    ``` `                                                                  @@@@@@rV   compilezAotCodeCompiler.compiler  sm    +<<7"OPP%$-&'
 ?;;=> J;%#7JENN 	 +6+>+>+J+JK	
!# //99lK%@ALK$)/	%
!\ /	
; &&""<0&&77&&{3:LI9;G
 ,	
 	
 +	
 !%\ 2#K0188;F '')##T#2/03CCD
L	 L	\ 	3>GGLL;#89<
  _	6-+.)55g>,( 2C8 <AGG:;< &&..#**+CD**33H*5H&' %//,112.AI
 ++44::< 1!!S)jS.B 8B
 i% B

6#6#6#?#?@AB  #$..+001@ 
 KK	#34""**&&y1**;;#**+;< + ##//.::5AB 
  !OO002 HO< ""::%(XX . % 4 4 6. &" &)"01K $*#3#3#55U+:U""55#'  "NN*$4%+%5%5%7)/O %: %!!'!4!4!E!EE% "%!
 $9 $!$!$ 
 ).334$4;;<1	O #2"B"B"D'<<>I'-223#4;;<0	N "0!@!@!B%::<HII;=PQII:<NO""33 #)3305566IJ!
 &88G&&}599*E11*lK&&z2%%'$$&# 2 #q%++ekk*B*F*FMRRT  !'D+/< P&}cllCHN !O4=))//15 80 0x "(EHH,A,ABBB%?$@@P"Q		94Q(7%#(88#8#8#B "$'',,)00/# _.>E&&78" %'GGLL%,,.G%! +S1 5Q

?345""#89 $)==#4#4-/ 
 +00779$$--d3 !!M 
  HH]3M&LY&W#K4&'"3	  $   #HhF84%,J "224H"779III/: lC( 9A-.A-B"EF*8*B789
 k3' 91-.@-ADE*8*B789
 ""33")3305566HI 
 !33LA&&|4&&~6
 $"%-77499::QR#K
 k40 HI!(:;!C(FGH $**;7&&x0&&}5,,ZB*002 DE00UCD11*=  "((H= &FIIf%& $#!)!5!5!7J #E: 6Ii/ C4"&))+

49w7J+J#KL

#56

6;;sL#ABC &&..#**95_	6B	 && #"E	< <(B BF5 5:9 9
9 90H H4C Co_	6 _	6s   0)v1u+Cv13v1=uT,v10%u,A#v18.u9&Bv1<<u>8v1<vB6v1>7v5C#v1A!v%93v1u	v1u)	$v1,u6	1v1>v	v1v	v1v"	v1%v.	*v11v:N)rs  rd   r  r   r  r   rX  Optional[str]r  r   rY  r  r   zUnion[list[str], str])r   r   r   r  r  rR   rX   rV   r  r  q  sa    cc c 	c
 )6c c $c 
c crX   r  c                     t        t              j                  dz  } | j                         5 }|j	                         }t        |d      \  }}d d d        t        |      S # 1 sw Y   t              S xY w)Nr  r  )r   r  r0  r   r  r"  r,   )r{   r?  r  r  r]  s        rV   cpp_prefix_pathr  a  sm     >  #99D	 
&&(
8
 $H--
 $H--s    A  A3c                     t               } t        j                         r#dt        j                  j                  |        dS d|  dS )Nz
#include "")r  r   r  rz   r{   r  r]  s    rV   
cpp_prefixr  n  sF     H BGG,,X67q99H:Q''rX   zOptional[CDLL]_libgompc                   d	fd|D cg c]
  } |       }}| j                  d      s
J | dz          d }t        | j                  d            D ]+  \  }}|dk(  rt        j                  |      }t        ||      }- t        |      s
J | dz          t               }t        |j                  j                  |      D ]!  \  }}	|j                  s|	||j                  <   # |r|t        |       d =  ||i |}
t        |
t        t         f      r|
D cg c]  }|t#        j$                  g       n| }
}t        |
      D ])  \  }}t        |t"        j&                        r!J | dz           t"        j(                  j*                  j-                  |
      S t        |
t"        j&                        s
J | dz          t"        j(                  j*                  j/                  |
      S c c}w c c}w )
Nc                    t        t        |             dk(  r)t        j                  j                  j                  |       S t        | t        t        f      r t        |       fd| D              S | S )Nz<class 'PyCapsule'>c              3  .   K   | ]  } |        y wrQ   rR   )r   r8  convert_args     rV   r   z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>  s     9[^9s   )	r   r  rt   r  _aoti&alloc_tensor_by_stealing_from_void_ptrr  r  tuple)argr  s    rV   r  z&custom_op_wrapper.<locals>.convert_arg~  sW    tCy>2288>>HHMMdE]+499S999JrX   z
torch.ops.z, can not be called through custom_op_wrapperr  r   z, can not be loaded through custom_op_wrapperz returns a list of non-tensorsz returns a non-tensor)r  r   r   r   )
startswithrO  r  	importlibimport_moduler  callabler  zip_schema	arguments
kwarg_onlyr   r  r  r  r  rt   tensorr   r  r  #unsafe_alloc_void_ptrs_from_tensors!unsafe_alloc_void_ptr_from_tensor)r   rT   r  converted_argsfuncr  r~  rU   func_argconv_argresultrr  s               @rV   custom_op_wrapperr  {  s    3773k#&7N7==& 
;;& D"((3-(  16**1-DtQ 
 D>N2 NNN> VF!$,,"8"8.I -($,F8==!- CK<>*>,V,F&4-(@FG1ai%,,r"Q6GGf% 	VDAqa.U5U0UU.	Vxx~~AA&II&%,,/M6M1MM/xx~~??GG= 80 Hs   G' G,c                      e Zd ZU i Zded<    eej                        Zi Zded<   ed
d       Z	e
d
d       Ze
	 	 	 d	 	 	 	 	 	 	 	 	 dd       Ze
ddd	       Zy)CppCodeCache0dict[str, Callable[[], Union[CDLL, ModuleType]]]r   r   cpp_compile_command_flagsc                ,    t        j                  |       S rQ   )r   LoadLibrary)r{   r   s     rV   _load_library_innerz CppCodeCache._load_library_inner  s    %%rX   c           	        	 | j                  ||      }||_        |S # t        t        f$ r}dt	        |      v rTt
        j                  j                  d      r5t        j                  d      a
| j                  ||      }||_        |cY d }~S dt	        |      v r9t        | dt        j                          dt        j                          d      | d }~ww xY w)Ngompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorrm  r   rz   r{   r   r   r  r  tempfile
gettempdir)r  r{   r   r  r   s        rV   _load_libraryzCppCodeCache._load_library  s    	,,T37FFJMW% 	QBGGNN3L$M  ++,EF00s; 
9SVCcLXM`M`MbLc d33;3F3F3H2I J]]
  	s"    CACCACCNc           	     @    i  j                   |t               |d}t                t        ddt	        di |      }t        |j                               }t        |d|      \  } j                  vrddl	m
}	 t        j                  j                  t               d	z         }
t        |      \  }}d d t	        di |d
t!        j"                         xr |dk(  i}t        ||||      }t%        j&                  t(        |
|      t+        |j-                               d fd}|> |	|
t.              5  t        j                  j1                        s |      d d d        | j                  <    j                     S # 1 sw Y   'xY w)N)r  r  extra_flagsr  r  r  r  r
  r   r  r  r  r  r  c                 r    3j                                  } | J j                        J S rQ   )r  r  )r  binary_pathr  futurer   r  	worker_fns    rV   load_fnz(CppCodeCache.load_async.<locals>.load_fn  sI    ;)&[F!>)>++K=C?*?
rX   r  rR   r   r   )r  r-   r$   r'   r)   r  r<  r"  r   r?  r  rz   r{   r|   r   r+   r   r  r   r
   _worker_compile_cppr,   r   r@  r   )r  rl  r  	submit_fnr  rk  command_genvec_isa_cmd
input_pathr  	lock_pathr  r  cpp_build_optioncpp_builderr  r  r  r   r  r  s   `               @@@@@rV   
load_asynczCppCodeCache.load_async  s   
++
&#~&	
 	 c/D/W/W
 ;779:U+FZcii6\^S7]CI&LZ&X#K,0FC4  ! #)#3#3#5#N+:N  % "%,	K "))#I
 3;3S3S3UVK	 	 $i> 677>>+6!*9!56 %CIIcNyy~6 6s   (FFc                0     | j                  ||             S rQ   )r  )r  rl  r  s      rV   r   zCppCodeCache.load  s    7s~~k;799rX   )r{   r   r   r   r   zUnion[CDLL, ModuleType])r  NrR   )
rl  r   r  r   r  r   r  Sequence[str]r   r   )r  )rl  r   r  r   r   r   )r   r   r   r   rG  r   r  r  r  r  r  r  r  r   rR   rX   rV   r  r    s    >@E;@u{{+K02~2& &  ,  !%'FF F 	F
 #F 
F FP : :rX   r  c                    ddl m}  || t              5  t        j                  j                  |j                               s|j                          d d d        y # 1 sw Y   y xY w)Nr   r  r  )r?  r  r@  rz   r{   r   r   r  )r  r  r  s      rV   r  r    sK     /	)\	2  ww~~k>>@A     s   >AA%c                      e Zd ZU i Zded<    eej                        ZdddZdZ	dZ
dZ ej                  d	      Zedd
       Ze	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd       Zy)CppPythonBindingsCodeCacher  r   FTinclude_pytorchsharedr  zkernel({}); Py_RETURN_NONE;r  aR  
        // Python bindings to call {entry_func}():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {{
            static_assert(std::is_pointer_v<T>, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }}
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }}
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }}

        {extra_parse_arg}

        static PyObject* {entry_func}_py(PyObject* self, PyObject* args) {{
            try {{
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != {arg_len}))
                    throw std::runtime_error("requires {arg_len} args");
                {call_entry_func}
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}

        static PyMethodDef py_methods[] = {{
            {{"{entry_func}", {entry_func}_py, METH_VARARGS, ""}},
            {{NULL, NULL, 0, NULL}}}};

        static struct PyModuleDef py_module =
            {{PyModuleDef_HEAD_INIT, "{entry_func}", NULL, -1, py_methods}};

        PyMODINIT_FUNC PyInit_{entry_func}(void) {{
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {{
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }}
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            PyObject* module = PyModule_Create(&py_module);
            if (module == NULL) {{
                return NULL;
            }}
            #ifdef Py_GIL_DISABLED
                PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED);
            #endif
            return module;
        }}
        c                   t        t        j                  j                  j                  j
                        t        j                  d<   | d| j                   }	 t        j                  |   S # t        $ r Y nw xY wt        j                  j                  ||      }|J t        j                  j                  |      }|t        j                  |<   |j                   j#                  |       |S )N'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRr  )r   rt   r  _dynamorp  '_torchinductor_pyobject_tensor_data_ptrrz   environentry_functionr;  r  KeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)r  r{   r   module_namer  r  s         rV   r  z.CppPythonBindingsCodeCache._load_library_inner  s    @CHH##KKA


<= Qs1123	;;{++ 		~~55k4H006#)K 's   A. .	A:9A:Nc                \   
 dj                  d t        |      D              } j                  j                  t	        |       j
                  j                  |       j                   j                  j                  |            } j                  ||z   |||      
dd 
fd}	|	S )	a5  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            source_code: C++ source code containing a ENTRY_FUNCTION() function

        Returns:
            A python version of ENTRY_FUNCTION()
        , c              3  T   K   | ]   \  }}d |j                  dd       d| d " yw)z
parse_arg<zconst r  z>(args, )N)rR  )r   nargtypes      rV   r   zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>  s7      
7 267xs!D
s   &()	array_len)arg_lencall_entry_func
entry_funcextra_parse_arg)r  r  Nc                 f            t        t              sJ t         j                        S rQ   )r  r   r  r  )r  
get_resultr  s   rV   r  z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.future  s2    ~#!&*55563#5#566rX   r  )	r|   rO  suffix_templateformatr  call_entry_functionr  r  r  )r  argtypesrl  r  num_outputsr  r  	parseargssuffixr  r  r  s   `         @@rV   load_pybinding_asyncz/CppPythonBindingsCodeCache.load_pybinding_async  s    ( II 
'1
 
	 $$++M33::9E))//666M	 , 
 ^^& #	 $ 

 	7 rX   c                0      | j                   |i |       S rQ   )r  r  rT   rU   s      rV   load_pybindingz)CppPythonBindingsCodeCache.load_pybinding  s     8's''88::rX   )r{   r   r   r   r   r   )r  r   NrR   )r  r  rl  r   r  r   r  r.  r  r   r  r  r   r   )rT   r   rU   r   r   r   )r   r   r   r   rG  r   r  r  r  r  r  r  textwrapdedentr  r  r  r  r  rR   rX   rV   r  r  #  s    >@E;@u{{+K !!
 N7O%hooU	WOr    
 !%',, , 	,
 , , #, 
, ,\ ; ;rX   r  c                  v    e Zd ZU i Zded<    eej                        ZdddZdZ	dZ
 ej                  d      Zy)	CppWrapperCodeCacher  r   Tr  inductor_entry_cppzreturn inductor_entry_cpp({});a	  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {{
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }}
            return result;
        }}

        static inline PyObject* pack_tensor_handle_list(const std::array<AtenTensorHandle, {array_len}>& arr) {{
            PyObject* result = PyList_New({array_len});
            for (size_t i = 0; i < {array_len}; i++) {{
                PyObject *elem =
                    arr[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(arr[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }}
            return result;
        }}

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {{
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }}

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {{
            // For outputs, we only allocate an array to hold returned tensor handles,
            // not the actual output tensor storage.
            std::array<AtenTensorHandle, {array_len}> output_handles{{}};
            try {{
                inductor_entry_impl(input_handles.data(), output_handles.data());
                if (PyErr_Occurred()) {{
                    return nullptr;
                }}
                return pack_tensor_handle_list(output_handles);
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}
        N)r   r   r   r   rG  r   r  r  r  r  r  r  r  r  rR   rX   rV   r   r     sK    >@E;@u{{+K! *N:%hoo2	4OrX   r   c                     e Zd ZU i Zded<    eej                        ZdZded<    e	j                  d      Ze e	j                  d      z   Ze e	j                  d      z   Z e	j                  d	      Zedd
       Zedd       Ze ej&                  d      dd              Zedd       Ze ej&                  d      dd              Ze ej&                  d      dd              Ze	 d	 	 	 	 	 	 	 dd       Zedd       Zedd       Zy)HalideCodeCachez0dict[str, Callable[[], Union[ModuleType, CDLL]]]r   Nr  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        c                   |j                   J |j                  +t        |j                         t        |j                        k(  sJ |j                  J |j                  xs |j
                   d|j                   }|rd| d}d}d}d}nd}d}d| d}d	}g }	t        |j                   |j                        D ]  \  }
}|	j                  d
|
 d| d        d| dd| ddj                  |	       d| d| d| d| d| d| d| d| d| d|j                          d| dt        |	       d| d| d| dg
S )Nz + zreinterpret_cast<uint64_t>(r  cuda_interfacenullptrhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(halide_buffer_flag_host_dirtyzhalide_dimension_t(0, r  zhalide_buffer_t ;zhalide_dimension_t z_dims[] = {z};z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestrider  offsetalias_ofr   r  r  r|   halide_type)r  r   r  r   r(  r   device_interfacehostflagsdimssizer  s               rV   _codegen_bufferzHalideCodeCache._codegen_buffer\	  s   yy$$$zz%#cii.C

O*KKKzz%%%ll.chh/s3::,?28*A>F/D5EF(0
!<D3E		3::6 	DLD&KK0bBC	D tfA&!$|DIIdO3DCHfJvha(f()9(:!<fHTF!$fIeWA&fHS__./q1fN3t9+Q/fGD6(f'(
 	
rX   c           	        |j                         }|d|j                  v u sJ d|j                  v sJ g }g }t        |j                        D ]z  \  }}|j	                         r:|j                  d|        |j                  | j                  d| ||             Pd|j                  vsJ |j                  |j                         | dj                  |D cg c]  }d| 	 c}      j                         }|r| j                  n| j                  }	|	j                  | j                  |rdnd	      |d
j                  d |j                  D              |d
j                  |            }
|
S c c}w )Nuser_context
no_runtimez&hl_buf_hl_buf_*r#      HalideRuntimeCuda.hzHalideRuntime.hr  c              3  r   K   | ]/  }|j                   !|j                          d|j                    1 y w)Nr7  )r  bindings_typer   )r   r8  s     rV   r   z0HalideCodeCache._codegen_glue.<locals>.<genexpr>	  s9      ::% ??$%Qqvvh/s   57)halideruntime_h
headerfileargdefsbuffersbuffer_names)r  r-  rO  r  	is_bufferr  extendr  ctyper   r|   lstripglue_template_cudaglue_template_cppr  find_header)r  rT  r!  r  r#  r$  r  r  lineglue_template	glue_codes              rV   _codegen_gluezHalideCodeCache._codegen_glue~	  sh   ,,.>T[[8999t{{***. 	.FAs}}##hqcN3s22WQC=#wOP#))+++##CHH-	. ))w?ttD6]?@GGI29..s?T?T!((OO)0%6G "II  
 <0 ) 
	 ! @s   E-c                    t        ddt                     }|j                         }t        dj	                  | j
                  | j                  | j                  |g      j                  d            S )NOIr  r#  r   )	r'   r(   r<  r  r|   r*  r)  standalone_runtime_cuda_initr   )r  r  command_lines      rV   config_hashzHalideCodeCache.config_hash	  sp     !"

 #335II))**44 	 fWo	
 		
rX   c                X   t         j                  j                  j                  d      }||j                  st        d      	 |j                  d   }t        j                  |      D ]  }|j                  d      s	 t        j                  dt        j                  j                  ||      g      }t        j                  d|j!                  d            }|sst        j                  j                  t        j                  j#                  |j%                  d            |       }t        j                  j'                  |      st        j                  j#                  |      c S  	 t        |      # t        j                  $ r Y #w xY w# t(        $ r}t        |      |d }~ww xY w)	Nhalidez$halide python bindings not installedr   r  lddz(/.*)/libHalide.sor   rE   )r  	machinery
PathFinderr  r  r   rz   rL  r  
subprocesscheck_outputr{   r|   SubprocessErrorr  searchr  abspathgroupr   rO  )	r  errmsgr  r>  fileoutmr{   r   s	            rV   _search_for_filez HalideCodeCache._search_for_file	  sL   ""--77A<t>>EFF	.44Q7F

6* 9=='!(55"BGGLL$>?
 		"7G9LMA!ww||BGGOOAGGAJ,GP77>>$/#%77??4#889 6"" &55 ! !  	.v&A-	.sO   8F >5E53'F A+F  F (F 5FF FF 	F)F$$F)c                *   d| j                          d}dt        j                  v rRt        j                  j	                  t        j                  d   |      }t        j                  j                  |      r|S d| d}t        j                  ||      S )Nlibautoschedule_r  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r  rz   r  r{   r|   r   r  rE  )r   sofiler{   rA  s       rV   find_libautoschedulez$HalideCodeCache.find_libautoschedule	  s}     $DJJL>52::%77<<

< 8&ADww~~d#&!TU 	 //??rX   c                   dt         j                  v rRt         j                  j                  t         j                  d   |       }t         j                  j	                  |      r|S dt         j                  v rrt         j                  j                  t         j                  j                  t         j                  d   d|              }t         j                  j	                  |      r|S d|  d}t        j                  d|  |      S )NHALIDE_INCLUDErH  z../include/rI  z7, set env HALIDE_INCLUDE to the directory containing it)rz   r  r{   r|   r   r?  r  rE  )r   r{   rA  s      rV   r+  zHalideCodeCache.find_header	  s     rzz)77<<

+; <dCDww~~d#2::%77??RZZ5TF7KLD ww~~d#$VW 	 //+dV0DfMMrX   c                   t        t        t        |t        | j	                         |f            d      d         }t        j                  |d       d t        |dz        }t        |dz        }t        |dz        }t        |d	z        }t        |d
z        }	t
        j                  j                  |       }
g }|
rt        ||       t        j                  |ddd| ddddg
}|j                  r,|j                  d| j                  |j                        g       |j                  |j!                                |j#                  t%        j&                  t(        j*                  |             |j,                  D cg c]  }|j.                  |j1                         ! }}|j3                         r|j#                  d       | j5                  || j7                  ||      || j9                         f|
r|j"                  nd |j3                         rdnd      |
r`|j#                  t%        j&                  t:        |             t%        j&                  t<        |	|      }|r ||      j>                  n |        dfd}|S c c}w )Nr  r7     Tr   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdonerd  -gr  -oz-fhalide_kernelz-ezstatic_library,h,schedulez-p	uintptr_tr   r  )r  r  r  c                 $    r                  S rQ   rR   )bindings_futurewait_for_compiles   rV   r   z3HalideCodeCache.generate_halide_async.<locals>.load'
  s     ""$$rX   )r   Callable[[], Any]) r   r  r  r  r5  rz   r   r   r{   r   r   r;  
executable	schedulerr&  rK  rT   r  r   r
   r;  
check_callr  r  r  r  r  r/  build_standalone_runtimetouch_worker_task_halider  )r  rT  rl  r  dirpathgenfilelibfiler!  donefilelockfileneed_compilejobscmdr  binding_typestaskr   rV  rW  s                    @@rV   generate_halide_asyncz%HalideCodeCache.generate_halide_async	  sM     1489  
 	Gd+g 445g 112#445
w'(w'(77>>(33+.)+C ~~

D#":":4>>"JKLJJtyy{#KK	))**?*?EF ,0==
$'CLL<PC
 
 <<>  -22dJ/ #">">"@A%1dkkt"&,,.e 3 
 KK	))%:;$$%8(DID#,T?#9#9 	%
 5
s   J	J	c                0      | j                   |i |       S rQ   )ri  r  s      rV   generate_halidezHalideCodeCache.generate_halide.
  s     9(s(($9&9;;rX   c           	     r   | j                   r5t        j                  j                  | j                         r| j                   S t        j
                  j                         rdnd}d}|dk(  rdnd}| j                   r6t        j                  j                  | j                         rJ t               }n
t               }t        |      d| d| j                          z  }t        j                  |d	       t        |d
z        }t        |dz        }t        |dz        }t        |dz        }	t        ||z        }
t        j                  j                  |      sdd l}ddlm}  ||t               5  t        j                  j                  |      st#        |d      5 }|dk(  r9|j%                  | j&                  j)                  | j+                  d                   d d d        |j-                  |	|j/                  |             t1        |
      \  }}t3        |||	g|t5        |            }t7        j8                  t;        j<                  |j?                                      tA        |       d d d        t        j                  j                  |
      sJ |
| _         |
S # 1 sw Y   xY w# 1 sw Y   ?xY w)Nr   r  zlibStandaloneHalideRuntime.soz	host-cudar  zhalide-runtime--Tr   rP  rd  z	hooks.cppzstandalone_halide_runtime.ar   r  r*  r  r  r  )!r  rz   r{   r   rt   r   r  r5   r4   r   r5  r   r   r7  r?  r  r@  r   r"  r3  r  r+  compile_standalone_runtimeTargetr+   r'   r)   r;  r[  shlexr  r<  r]  )r  r  libnamer-  baser_  rb  rc  hookfileafilerJ  hlr  r?  r   r  halide_cmd_gens                    rV   r\  z(HalideCodeCache.build_standalone_runtime2
  sH   ''BGGNN((-
 /// %

 7 7 9fu1 +v 56''ww~~c&B&BCCC
 %&D;Dt*#//:K9LMM
Gd+w'(w'(w,-G;;<Ww&'ww~~h'6(L1 $ww~~h/h, &&0GG # @ @ G G$'OO4I$J!" 11%69JK'Mf'U$D*%/!!)5 1#-$9(3%	&N ))N$C$C$EF (O1$2 ww~~f%%%'-$3 $ $s%   1,J-?J!BJ-!J*	&J--J6)r   r   r  rj   r   r   r   r  )rT  rk   r!  objectr   r   r  )r  r   rA  r   r   r   )r   r   r   r   rQ   )rT  rk   rl  r   r  r   r   rX  )rT   r   rU   r   r   rX  )r   r   r   r   rG  r   r  r  r  r  r  r  r*  r)  r3  r  r  r/  r   r   r5  rE  rK  r+  ri  rk  r\  rR   rX   rV   r  r  	  s   >@E;@u{{+K.2m2X__	F& 	"   /(//		#  $38??	$ B 
 
B  > Y
  
$ # #. Y	@  	@ YN  N  BFBB,/B<?B	B BH < < 8 8rX   r  c                B   ddl m} 	  || t              5  |D ]	  } |         	 d d d        y # 1 sw Y   y xY w# t        j                  $ rP}t
        j                  j                  d      dk(  r't        |dd      ^}}}t
        j                  j                  |      j                  d      rt        |      j                         }d}	|j                  |	      d	k(  sJ  G d
 d      }
 |
       ||j                  d      d	z   <   t!        j"                  t!        j$                  ddg|d      d      }|j'                  |	|      }t        dd      5 }|j)                  |j+                                d d d        n# 1 sw Y   nxY wt-        d|       | d }~ww xY w)Nr   r  HALIDE_REPRO1rf  )r  r  r  pythonz    hl.main()rE   c                      e Zd ZddZy) _worker_task_halide.<locals>.Outc                     y)NrC  rR   r   s    rV   __repr__z)_worker_task_halide.<locals>.Out.__repr__~
  s    $rX   Nr  )r   r   r   r  rR   rX   rV   Outr~  }
  s    %rX   r  rR  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        r  r*  zwrote repro.py: )r?  r  r@  r;  r=  rz   r  r   r  r{   r  r  r   r  countindexr  r   r  rR  r"  r(  r   )rc  re  r  jobr   r|  scriptrf  r	  mainr  replfds                rV   r^  r^  n
  s   . h- 	 	 	 	 %% ::>>.)S0#*1e\#B FFSww'228<F|((*&zz$'1,,,% % ,/5CIIdOa'(OO( *4(:c(:'= > 
 ||D$/*c* ,bHHT[[]+, , ,"%5aS#9:A9sB   : .: 7: : FDF E=4	F=F	FFc                8    t        | d      j                          y )Nr8  )r   closer  s    rV   r]  r]  
  s    3rX   c                      e Zd ZU g Zded<   i Zded<   eddd       Ze	 	 	 d	 	 	 	 	 	 	 	 	 dd       Ze	 	 d	 	 	 	 	 	 	 	 	 dd       Z	eddd	       Z
e ej                  d      	 	 	 	 	 	 dd
              Zy)PyCodeCachezlist[ModuleType]r  z dict[str, list[tuple[Any, ...]]]linemapsc                    t        |d|      S Npyr  r%  )r  rl  r
  s      rV   r"  zPyCodeCache.write
  s    [$e44rX   Nc                L    t        |d|      \  }}| j                  ||||      S r  )r"  load_by_key_path)r  rl  r
  linemapattrsr   r{   s          rV   r   zPyCodeCache.load
  s-     +t59	T##Cw>>rX   c                .   |g }t        ||      }t        t        |       | j                  |<   |%|j	                         D ]  \  }}t        |||        |s"|s t        j                  t        ||      |_	        | j                  j                  |       |S rQ   )r2   r  r  r  r   setattrr   r
   r3   _reload_in_subprocr  r  )r  r   r{   r  r  modr  r  s           rV   r  zPyCodeCache.load_by_key_path
  s     ?G#C. "#w-0T #1Q"# 5%.%6%60#t&C" 	3
rX   c                    |r?| j                   D ]0  }	 |j                  sJ t        j                  |j                         2 | j                   j                          y# t        $ r Y Yw xY w)z
        Clear the in-memory module cache. If purge=True, also delete all the
        corresponding on-disk source files.
        N)r  r  rz   r:  r  r  )r  purger  s      rV   r  zPyCodeCache.cache_clear
  sa     {{ <<'<IIcll+ 	 ) s   -A	A)(A)c                    || j                   vry | j                   |   \  }}t        ||      }|dk(  ry ||dz
     }|sy dd} ||      S )Nr   rE   c           	         d}t        j                  ||       }t        |      D cg c]  \  }}}|t        |      |d c}}}S c c}}}w )Nz"File "(.+)", line (\d+), in (.+)\n)r]  r,  r   )r  findallreversedr.  )stack_traceregexmatchesr?  lr  s         rV   parse_stack_tracez<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace
  sV     :Ejj4G  (0 Aq! A:  s   A)r  r   r   zlist[dict[str, Any]])r  r   )r  r{   linenor  r  r  r  r  s           rV   stack_frames_for_codez!PyCodeCache.stack_frames_for_code
  s`    
 s||#||D)u'6a!e	 !''rX   r  )rl  r   r
  r   r   tuple[str, str])r  NN)
rl  r   r
  r   r  Optional[list[tuple[int, str]]]r  r   r   r   )NN)
r   r   r{   r   r  r  r  r   r   r   r  )r  r   r   r   )r{   r   r  r.  r   zOptional[list[dict[str, Any]]])r   r   r   r  rG  r  r  r"  r   r  r  r   r   r  rR   rX   rV   r  r  
  s    !#G"13H.35 5  37*.?? ? 1	?
 (? 
? ? 
 48*.  1	
 ( 
 6   Y(( #(	'(  (rX   r  c                @    t        t        j                  |      |       S rQ   )r  r  r   )kernel_namerl  s     rV   _load_triton_kernel_from_sourcer  
  s     ;##K0+>>rX   c                    t        j                  t        j                  j                        rt        j                  j                  S t        j
                         r/t        j                  j                  t        j                  dd      S t        j                  t        j                  d            rt        j                  dd      S t        j                  t        j                  d            rQt        j                  j                  t        j                  j                  t        j                  dd      d            S y)NbinnvccCUDACXXr  	CUDA_HOMEzbin/nvcc)r    
nvcc_existr   r   cuda_cxxr  rz   r{   r|   rK   sdk_homegetenvrealpathrR   rX   rV   _cuda_compilerr  
  s    6;;//0{{###ww||K00%@@299Y/0yyB''299[12wwRYY{B-G TUUrX   c            	     r   t        j                         rddlm}  | j	                  d      }nt         j
                  j                  }t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            gS )Nr   r  zcutlass-3-headersincludeztools/library/includeztools/library/srcztools/util/include)r   r  r  r  get_dir_pathr   cutlass_dirrz   r{   r  r|   )r  cutlass_paths     rV   _cutlass_include_pathsr  
  s    $++,?@{{.. 	lI>?
l4KLM
l4GHI
l4HIJ rX   c                    t                ddlm}  | j                  d      }g }t	               rPt        |       |D ]  }|j                  d| dd| g        |j                  d       |j                  d	       |S t        d
      )Nr   )cpp_extensionr   rn  z-Lz-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)	r$   torch.utilsr  library_pathsr8   r&   r&  r  NotImplementedError)r  lpathsextra_ldflagsr{   s       rV   _cuda_lib_optionsr    s    )((V(<F!Mzf% 	ND   Btf+zWTF;K!LM	N 	X&Z(
  "[
 	
rX   c                 
    g dS )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-WconversionrR   rR   rX   rV   _nvcc_host_compiler_optionsr  /  s     rX   c                    t        j                         } | dk(  rd} d|  d|  g}t        j                  j                  r	|d|  gz  }dddd	d
d|  ddj                  |       dt        j                  j                  dddg
}t        j                         r>|j                  dt        j                  j                  t        j                        g       t        j                  j                  r|j                  g d       t        j                  j                  r|j                  g d       t        j                  j                   r|j                  ddg       |S )N9090asm_compute_lto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z+-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1z'-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLEDz-wz-gencode=arch=compute_z,code=[,]z
-std=c++17z--expt-relaxed-constexprz-DNDEBUGz-ccbin)z	-lineinforQ  z-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r    get_cuda_archr   r   enable_cuda_ltor|   compile_opt_levelr  r&  rz   r{   r  rK   gccenable_debug_infoenable_ptxas_infouse_fast_math)archr	  optionss      rV   _nvcc_compiler_optionsr  8  s.   !!#Dt|$LHTF+,D{{""4v,51
 gchhtn-=Q?%%"G "''//+//"BCD{{$$KL{{$$	
 {{  !2	
 NrX   c                r   |g }t               }t               }t               }t               }||z   |D cg c]  }d|v rd| nd|  c}z   |D 	cg c]  }	d|	z   	 c}	z   |z   }
dj	                  |       }d}|dk(  r%t                ddj	                  |
       d| d| }nt|d	k(  r6|
j                  d
       t                ddj	                  |
       d| d| }n9|dk(  r%t                ddj	                  |
       d| d| }nt        d| d      t        j                  d|       |S c c}w c c}	w )N=z-Xcompiler z-Xcompiler=z-Ir7  r  r  z -c -o soz-sharedz -o exezUnsupported output file suffix !zCUDA command: %s)
r  r  r  r  r|   r  r  r  r  r$  )	src_filesdst_filedst_file_ext
extra_argsinclude_pathscuda_lib_optionsnvcc_host_compiler_optionsnvcc_compiler_optionsoptr{   r  src_fileress                rV   cuda_compile_commandr  d  s    
*,M(*!<!>24
	 2
 $'#:k#[3FF
	
 $1
144$;
1	2 	  xx	"H
Cs!"!CHHW$5#6ghZq
S		y!!"!CHHW$5#6d8*AhZP		!"!CHHW$5#6d8*AhZP!$CL>QR"STTII #&J'
 2s   D/D4c                  P    e Zd ZdZ	 	 	 	 d
dZddZddZddZddZddZ	ddZ
y	)
DLLWrapperz A wrapper for a dynamic library.c                b    || _         d| _        t        j                  |      | _        d| _        y )NFT)lib_pathis_openr   r  DLL)r   r  s     rV   r   zDLLWrapper.__init__  s+     !##H-rX   c                L    | j                   r| j                          d| _         y y r\   )r  _dlcloser   s    rV   r  zDLLWrapper.close  s    <<MMO DL rX   c                    d }t               r;t        d       }t        |d      st        d      }t        |d      rF|j                  }n9t	               r$dd l}|j                  dd      }|j                  }nt        d      |wt               r)t        g|_	         || j                  j                         y t	               r9dd l}ddlm} |j                  g|_	         || j                  j                         y y t        j                  d	       y )
Ndlclosezlibc.sor   kernel32T)use_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)r8   r   rP  r  r9   r$  FreeLibraryr  r   r  r  _handler  HMODULEr  r  )r   	f_dlclosesymsr$  r  r  s         rV   r  zDLLWrapper._dlclose  s    	::D4+ItY' LL	\{{:d{CH ,,I%&NOO z&.Z	"$((**++&.&6&6%7	"$((**+  KK]rX   c                    | j                   st        d| j                         t        | j                  |      dfd}|S )NzCannot use closed DLL library: c                 D     |  }|rt        dj                         y )NzError in function: )r   r   )rT   errmethods     rV   _wrapped_funcz-DLLWrapper.__getattr__.<locals>._wrapped_func  s,    $-C"%88I#JKK rX   rT   r   r   r   )r  r   r  r  r  )r   r   r   r  s      @rV   __getattr__zDLLWrapper.__getattr__  s?    ||!@PQQ4(	L
 rX   c                    | S rQ   rR   r   s    rV   	__enter__zDLLWrapper.__enter__  s    rX   c                $    | j                          y rQ   r  )r   rT   s     rV   __exit__zDLLWrapper.__exit__      

rX   c                $    | j                          y rQ   r  r   s    rV   __del__zDLLWrapper.__del__  r  rX   N)r  r   r   r   r   )r   r   r   zCallable[..., None])r   r   r  )r   r   r   rF  r   r  r  r  r  r  r
  rR   rX   rV   r  r    s;    * 
!
!FrX   r  c                      e Zd ZU ej                   G d d             Zi Zded<    eej                        Z
dZed
d       Ze	 d	 	 	 	 	 	 	 dd       Zedd	       Zy)rS  c                  "    e Zd ZU ded<   ded<   y)CUDACodeCache.CacheEntryr   r  r  Nr   r   r   rG  rR   rX   rV   
CacheEntryr        rX   r  dict[str, CacheEntry]r   cuc                n    t        t        dgd|            }t        || j                  |      \  }}||fS z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        dummy_inputdummy_outputr  )r  r  r"  _SOURCE_CODE_SUFFIXr  rl  r  cuda_commandr   r  s         rV   r"  zCUDACodeCache.write  E      -.,O
  00
Z JrX   Nc                \   | j                  ||      \  }}|| j                  vrddlm} t	               } |t
        j                  j                  ||dz         t              }|5  |dt        | j                          |z   }	t
        j                  j                  |	      st        |g|	||      }
t        |d      5 }|j                  d       |j                  d|
 d       ddd       t               }t        j!                  d	|
       |
j#                  d
      }	 t%        j&                  |t$        j(                  t
        j*                         t               }d||z
   d|
 }t        j5                  |       nt        j!                  d|       t6        j9                  ||	      | j                  |<   ddd       | j                  |   j:                  ||fS # 1 sw Y   xY w# t$        j,                  $ r&}t/        j0                  ||j2                        |d}~ww xY w# 1 sw Y   mxY w)z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   r  r  r  Nr8  r#  z// CUDA Compile cmd
// zCUDA Compilation: %sr7  )stderrenvzCUDA Compilation took  seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)r"  r   r?  r  r   rz   r{   r|   r@  r  r  r   r  r   r   r  r$  r  r;  r<  STDOUTr  CalledProcessErrorr   CUDACompileErroroutputr1  rS  r  r  )r  rl  r  r  r   r  r  r   rd  r  rf  r?  
start_time	cmd_partserrorend_timelog_duration_msgs                    rV   r  zCUDACodeCache.compile  s    ))K>Zcii6#~HBGGLL3=A<XD S()HC0G0G,H+HILXww~~k2.#k<C j#. D!":3%r BCD "&JII4c: #		#IW"//%j.?.?RZZ
  $vH)?:@U?VVqruqv'w$HH-.IIR" "/!9!9*k!R		#5S8 		#**C<<-D D &88 W!229ellKQVVW!S SsJ   &AH"='G$9H"4G&A$H"G#	H"&H9!HHH""H+c                v    |dk7  rt        d| d|       | j                  ||      \  }}}t        |      ||fS z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   r  r  r  rl  r  dst_file_pathr   source_code_paths         rV   r   zCUDACodeCache.load  a     4--9N/+X  58KK5
1x!1 =)85EFFrX   rl  r   r  r   r   r  rQ   rl  r   r  r   r  Optional[list[str]]r   tuple[str, str, str]rl  r   r  r   r   ztuple[DLLWrapper, str, str])r   r   r   rQ  	dataclassr  r   rG  r   r  r  r  r  r"  r  r   rR   rX   rV   rS  rS    s       $&E %u{{+K  TX)=)=-0)=>Q)=	)= )=V G GrX   rS  c                      e Zd ZU ej                   G d d             Zi Zded<    eej                        Z
dZdZedd       Ze	 d	 	 	 	 	 	 	 dd	       Zedd
       Zy)rR  c                  "    e Zd ZU ded<   ded<   y)ROCmCodeCache.CacheEntryr   r  r  Nr  rR   rX   rV   r  r7  .  r  rX   r  r  r   r  Fc                n    t        t        dgd|            }t        || j                  |      \  }}||fS r  )r  r!   r"  r  r  s         rV   r"  zROCmCodeCache.write8  r  rX   Nc                V   | j                   s6d| _         t        j                  t        t	        t                                  | j                  ||      \  }}|| j                  vr`ddlm	} t               } |t        j                  j                  ||dz         t              }|5  |dt        | j                           |z   }	t        j                  j#                  |	      st%        |g|	||      }
t'               }|
j)                  d      }	 t+        j,                  |t*        j.                  dt        j0                        }t        j                  d	|       t'               }d
||z
   d|
 }t        j;                  |       nt        j                  d||	       t<        j?                  ||	      | j                  |<   ddd       | j                  |   j@                  ||fS # t*        j2                  $ r&}t5        j6                  ||j8                        |d}~ww xY w# 1 sw Y   axY w)z
        Compiles source_code into a file with dst_file_ext extension,
        using the compile command specific for the ROCm platform.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        Tr   r  r  r  Nr7  )r  r&  r  zCompilation output: %szCompilation took r  z+Skip compiling %s: output %s already exists)!_logged_compiler_versionr  r$  r*   r   r"   r"  r   r?  r  r   rz   r{   r|   r@  r  r  r   r!   r   r  r;  r<  r  r  r   r   r!  r"  r1  rR  r  r  )r  rl  r  r  r   r  r  r   rd  r  rf  r#  r$  r"  r%  r&  r'  s                    rV   r  zROCmCodeCache.compileG  s    +++/C(II/MO0DEF))K>Zcii6#~HBGGLL3=A<XD S()HC0G0G,H+HILXww~~k2.#k<C "&J #		#I	W!+!8!8%#-#4#4!% "

	" 		":FC  $vH):8j;P:QQlmplq'r$HH-.IIE"#
 "/!9!9*k!R		#7S: 		#**C<< &88 W!229ellKQVVW!S Ss3   (A&HAG#A%H#H6!HHHH(c                v    |dk7  rt        d| d|       | j                  ||      \  }}}t        |      ||fS r)  r*  r+  s         rV   r   zROCmCodeCache.loady  r.  rX   r/  rQ   r0  r3  )r   r   r   rQ  r4  r  r   rG  r   r  r  r  r:  r  r"  r  r   rR   rX   rV   rR  rR  ,  s       $&E %u{{+K$  TX/=/=-0/=>Q/=	/= /=b G GrX   rR  c                      e Zd ZddZy)CodeCacheFuturec                    t         rQ   )r  r   s    rV   r  zCodeCacheFuture.result  s    !!rX   Nr   Callable[..., Any])r   r   r   r  rR   rX   rV   r=  r=    s    "rX   r=  c                  *    e Zd Z	 d	 	 	 	 	 ddZddZy)LambdaFutureNc                     || _         || _        y rQ   )	result_fnr  )r   rD  r  s      rV   r   zLambdaFuture.__init__  s     #rX   c                "    | j                         S rQ   )rD  r   s    rV   r  zLambdaFuture.result  s    ~~rX   rQ   )rD  r@  r  zOptional[Future[Any]]r   r   r?  )r   r   r   r   r  rR   rX   rV   rB  rB    s'    MQ+5J	 rX   rB  )rT   r   rU   r   r   r   )r   r   r  )r   r   r   r   )r  r  r   r   r  )r	  Union[str, bytes]r
  rF  r   r   )r  r   r  r   r  r   r   r2  )r  r	  )r  rF  r
  r   r  r   r   r   )r  r	  r  )r  rF  r  r   r
  r   r  r   r  r   r   r  )r&  r   r   r   )FF)
r;  r   r  rF  r   r   r<  r   r   r   )rJ  rn   r   rn   )rS  r   r   r=   )r  zlist[str] | Noner  r   r  zhashlib._Hashr   r   )r   r  )
rn  r  r  r   r  ra   r  r!  r   ztuple[str, list[str]])r5  r.  r   r.  )r{   r   r   r  )r   r   rT   r   r   zUnion[list[c_void_p], c_void_p])r  r   r  r'   r   r   )rc  r   re  zlist[partial[Any]]r   r   )r]  r   r   r   )r  r   rl  r   r   rl   )r   r  )r   r  rQ   )
r  r  r  r   r  r   r  r1  r   r   )
__future__r   r   r]  rQ  r   r   r  rY  r  r   loggingrz   rM  r  r  rq  r8  rM  r;  r;  r  r  r3  rx  bisectr   r   r$  r   r   r   datetimer	   r
   pathlibr   r   r   typesr   typingr   r   r   r   r   r   r   r   typing_extensionsr   rt   torch.distributedr,  r2  r   r   torch._dynamo.utilsr   r   r   torch._inductorr   r   r   torch._inductor.codegen.cudar    ,torch._inductor.codegen.rocm.compile_commandr!   r"   torch._inductor.cpp_builderr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   torch._inductor.cpu_vec_isar-   !torch._inductor.custom_graph_passr.   r/   torch._inductor.freezing_utilsr0   r1   %torch._inductor.runtime.compile_tasksr2   r3   %torch._inductor.runtime.runtime_utilsr4   r5   torch._inductor.utilsr6   r7   r8   r9   torch._loggingr:   torch._subclasses.fake_tensorr;   r<   r=   torch._utils_internalr>   torch.compilerr  torch.compiler._cacher?   r@   %torch.fx.experimental.symbolic_shapesrA   rB   rC   torch.utils._ordered_setrD   package.pt2_archive_constantsrF   rS  rG   runtimerH   runtime.autotune_cacherI   triton_bundlerrJ   r  	triton.fbrK   torch._inductor.fb.utilsrL   rM   rN   rO   collections.abcr]   r^   r_   concurrent.futuresr`   r  ra   rb   rs  rd   irre   rp   rf   rg   rh   ri   runtime.hintsrj   rk   runtime.triton_heuristicsrl   r  rm   rn   r  r7  r@  _logginggetArtifactLoggerr   ru  	getLoggerr  rw   r   r   r}   r   r   r   r  r  r  r  r"  r'  r   r4  rA  rK  rU  PicklerrW  r  r  r  r  rO  ru  r  r(  r8  r:  r  r  r  r  r  r  rG  r  r  r  r  r   r  r^  r]  r  r  r  r  r  r  r  r  r  rS  rR  r=  rB  rR   rX   rV   <module>rq     s   "       	    	   	     
       ' '     	 	 	 #      J J 0 0 1   5 R M O  , 
 3 , I N N / E & # 8 ) 6%  ==)=$ A5:; A llg%..228]Kg!G T B
 B
J' '2Xi XvX
* 9;""!"25"" CI;;'*;<?;;   	
  "! 	  	
 
>   	p&.. pfVV%(V2?V	V  TX XD%    }" }"@'   #	
 ,!8C CL T       6e e` 
.  .(  . *HZ k: k: k:\    
  f; f; f;R =4 = =@ \0 \ \~
#L  \( \( \(~??#&??	 *)` '+	""" " $	"
 	"JH HV TG TG TGn [G [G [G|" "
 ?  rX   