
    VhZ                       U d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlZddlmZmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( er$ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4  ejj                  e6      Z7ejp                   G d d             Z9e:Z;de<d<   d(dZ=d)dZ>d*dZ?	 	 	 	 	 	 	 	 	 	 d+dZ@	 	 	 	 	 	 	 	 d,dZA G d d       ZB G d! d"eB      ZCejp                   G d# de9             ZDejp                   G d$ d%e9             ZEejp                   G d& d'e9             ZFy)-a  
This provides an abstract class which parametrizes over an "output code" concept
for Inductor.  Intuitively, this represents the compiled callable which Inductor
produces which you can call to get optimized code.  However, this callable
has some other capabilities:

- It is serializable, so you can save/load this product from disk without
  having to do compilation again.

- (When using remote cache) it is addressable, so you can save just a key
  which you can use to load this product from remote cache later.

This class is abstract because we have several different implementations of
serialized format:

- Python wrapper (the default)

- AOTInductor (this produces ABI stable binaries which work across PyTorch
  versions)

    )annotationsN)Path)AnyCallableOptionalTYPE_CHECKINGUnion)	TypeAlias)countersget_runtime_metrics_context)BoxedDeviceIndexCudagraphCachedInfoget_placeholder_info#log_cudagraph_skip_and_bump_counter)has_frozen_paramsis_frozen_param)align_inputs_from_check_idxs	BoxedBool	InputTypeoutput_node"set_tracing_context_output_strides)
OrderedSet   )config)AutotuneCacheBundler)Counter)Sequence)metrics)GraphLowering)_CompileFxKwargs)TritonKernelArtifactsc                      e Zd ZU  ej                  dd      Zded<    ej                  dd      Zded<   ddZ	 	 	 	 	 	 	 	 dd	Z	dd
Z
y)
OutputCodeNF)defaultinitOptional[str]_fx_graph_cache_keyzOptional[int]_time_taken_nsc                *    t        t        |             NNotImplementedErrortypeselfinputss     K/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/output_code.py__call__zOutputCode.__call__N       !$t*--    c                *    t        t        |             r*   r+   r/   example_inputs
cudagraphs	constantss       r1   post_compilezOutputCode.post_compileQ   s     "$t*--r4   c                *    t        t        |             r*   r+   r/   triton_bundles     r1   set_triton_bundlezOutputCode.set_triton_bundleZ   r3   r4   r0   zSequence[Any]returnr   r7   Sequence[InputType]r8   r   r9   CompiledFxGraphConstantsr@   Noner=   r   r@   rD   )__name__
__module____qualname__dataclassesfieldr'   __annotations__r(   r2   r:   r>    r4   r1   r#   r#   D   sn    
 *;):):4e)TT %6K$5$5d$ONMO..+. . ,	.
 
..r4   r#   r
   _StrideExprStrc                    t        | t        j                        sy t        | j                        D cg c].  }| j                  |      dk(  s| j                  |      dk7  s-|0 c}S c c}w Nr   r   )
isinstancetorchTensorrangendimstridesize)tis     r1   get_expanded_dimsrY   e   sL    a&QVV}L!q(8QVVAY!^ALLLs   A)A)"A)c                l    |D ].  }t         j                  j                  j                  | |dd      } 0 | S rO   )rQ   opsatenslice)rW   expanded_dimsexpanded_dims      r1   index_expanded_dimsr`   k   s4    % 8IINN  L!Q78Hr4   c                   t         j                  ryt        | t        |             j	                         } t        j                  |       dk7  r| j                         }| j                  }t        t        t        |                  }t        t        ||            D cg c]  \  }}|	 }}}t        t        |            D ]6  }|dk(  rdn
|||dz
        }|dk(  rdn
|||dz
        }|||      ||z  k  s6 y yc c}}w )NTr   r   F)r   *always_complex_memory_overlap_TESTING_ONLYr`   rY   squeezerQ   _debug_has_internal_overlaprU   shapelistrS   lensortedzip)	rW   stridessizesindices_xrX   prev_stride	prev_sizes	            r1   complex_memory_overlaprq   q   s    88 	A034<<>A((+q0((*uS\*+!'GW(=!>?A1??s7|$ 	A Av!771q5>+BK!Vwq1u~)>Iwqz"[9%<<		
  @s   C;CompiledFxGraphc                   |j                   J |j                  J |j                  }|j                  }|j                  }|j                  d   }|j                  d   }|s|j                  }	|	d   }
|j
                  }|j                  }t        j                  j                  s-| D ](  }t        |t        j                        st        |       * |1|s/|s-|j                  t        t!        |j"                                     ddlm} |j                   }|J  |||
xs dt        t!        |j"                              |||t)        |j+                               |t)        |j,                        	      |_         yt/        j0                  |       |rt        j                  j                  re|J |j2                  J |j                   t        j4                  j                  j7                  |j2                  d	
      J dfd}||_         d|j8                  v r1|j:                  rt=        |j:                         yt=        d|        yy)z
    Checks for any reasons not to run cudagraphs and then
    runs it on compiled_graph.
    Mutates the `compiled_graph.current_callable` and `cudagraphs`
    Nis_inferenceis_backwardstatic_input_idxsr   )cudagraphifyrL   )rv   device_indexstack_tracesru   rt   r9   placeholdersmutated_input_idxsF)create_if_none_existsc                4    j                           |       S r*   )set_to_running_backward)
new_inputscompiled_graph_callablemanagers    r1   compiled_artifactz1cudagraph_post_compile.<locals>.compiled_artifact   s    //1.z::r4   cudaskipping cudagraphs due to )r   z	list[Any]r@   zCallable[..., Any])current_callablecudagraph_infocudagraph_fail_reasonsboxed_forward_device_index	fx_kwargsrz   ry   r   tritoncudagraph_treesrP   rQ   SymIntintsetnextiterdevice_idxs
compile_fxrw   tuplevaluesr{   r   disablevalue	_inductorget_managerdevice_typesdisabled_cudagraphs_reasonr   )r7   compiled_graphr8   r9   cached_infor   r   rt   ru   r   rv   rz   ry   rW   rw   r   r   r   r   s                    @@r1   cudagraph_post_compiler      sC    **666((444 //K(??!/!J!J!++N;L **=9K!",,	%&9:"//"//}},,# a.F
 '2 &**4^5O5O0P+QR,)::+++*6/52d>#=#=>?%#%I,,./%$^%F%FG
+
' 	*%
 6==88-999-33???&4&E&E#oo55AA*00 B G &&&; /@N+^000 883"== 412H1IJ 1r4   c                ~    | s;|j                   J t        |j                   |      }||j                   ur||_         yyy)z
    Realigns input strides from inputs_to_check if
    we didn't end up running cudagraphs. Mutates
    `compiled_graph.current_callable` if cudagraphs
    was run. Otherwise, does nothing.
    N)r   r   )ran_cudagraphsr   inputs_to_checknew_callables       r1   maybe_realign_inputsr      sP     ..:::3++_
 ~>>>.:N+ ? r4   c                      e Zd ZdZddZy)rC   a\  Wrapper class that unwraps constants from a compiled fx graph. This
    version of the class only supports directly grabbing the saved constants off of
    a CompiledFxGraph.

    With freezing, FxGraphCache doesn't store the constants of the input
    GraphModule it gets from AOTAutograd. Instead, it saves just the **names**
    of those constants, and grabs the constant values directly from the graph module
    passed in at runtime.

    Thing is, we don't always *have* the graph module available at runtime, hence
    the existence of this class and its CompiledFxGraphConstantsWithGm counterpart.

    To support freezing, FXGraphCache gets passed a CompiledFxGraphConstantsWithGm during
    post compile. Otherwise, CompiledFxGraphConstants supports the basic case of loading
    the value of constants directly off of the original saved object.
    c                6    |j                   J |j                   S r*   )r9   )r/   gs     r1   unwrapzCompiledFxGraphConstants.unwrap  s    {{&&&{{r4   Nr   rr   r@   dict[str, torch.Tensor])rF   rG   rH   __doc__r   rL   r4   r1   rC   rC      s    "r4   rC   c                       e Zd ZdZddZddZy)CompiledFxGraphConstantsWithGma/  
    This version of CompiledFxGraphConstants, instead of grabbing constants
    directly saved on CompiledFxGraphs, will just grab their names. Then, it takes
    a second GraphModule to grab the corresponding constant values out of.

    This is necessary for supporting freezing in FxGraphCache.
    c                    || _         y r*   gm)r/   r   s     r1   __init__z'CompiledFxGraphConstantsWithGm.__init__  s	    r4   c           	         |j                   j                         D ci c]  \  }}|t        | j                  |       }}}|j                  xs i }i ||S c c}}w r*   )frozen_param_namesitemsgetattrr   r9   )r/   r   name	orig_namefrozen_paramsr9   s         r1   r   z%CompiledFxGraphConstantsWithGm.unwrap  si     $%#7#7#=#=#?
i '$''9--
 
 KK%2	-)-}--
s   !AN)r   torch.fx.GraphModuler@   rD   r   )rF   rG   rH   r   r   r   rL   r4   r1   r   r     s    .r4   r   c                     e Zd ZU dZded<   ded<    ej                  d      Zded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   d ed!<   d"ed#<   d$ed%<   d&ed'<   d(Zd)ed*<   d(Z	d+ed,<   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3d-Z
d4d.Z	 	 	 	 	 	 	 	 d5d/Zd6d0Zd7d1Zd8d2Zy()9rr   zr
    Class holding a compiled FX graph. This is the object serialized on disk
    to support FxGraph caching.
    Optional[Callable[..., Any]]r   str	cache_keyF)reprsource_codezOptional[list[tuple[int, str]]]cache_linemapzOrderedSet[str]r   zOrderedSet[int]r   mutated_inputsr{   z!Optional[dict[str, torch.Tensor]]r9   zdict[str, str]r   z dict[str, torch._C.ScriptObject]torchbind_constantsz4Optional[list[Optional[tuple[_StrideExprStr, ...]]]]output_stridesr&   r   metrics.CachedMetricsDeltasmetrics_deltasCounter[str]counter_deltasguards_exprzOptional[CudagraphCachedInfo]r   r    r   Sequence[int]r   Optional[BoxedDeviceIndex]r   NzOptional[bool]_boxed_callz%Optional[list[TritonKernelArtifacts]]_triton_bundlec                   || _         |j                  | _        |j                  r3t        |j                        5 }|j	                         | _        d d d        |j                  | _        t        |j                        | _        t        |j                        | _	        t        |j                        | _
        t        |j                        | _        t        |      s|j                  | _        i | _        ngi | _        i | _        |j                  j                         D ]<  \  }}t!        |      r|j"                  |   | j                  |<   .|| j                  |<   > |j$                  | _        || _        || _        || _        || _        d | _        d | _        i | _        d| _        d | _        d }|r| j(                  rRd| j                  v rt9        d| j(                          nt:        d   dxx   dz  cc<   t=        j>                  |       n;tA        d |	D              }tB        jD                  jF                  s3dd	l$m%}  ||| j                  | j                  |
      }|d u}|r
|| _        nd }| d
f| dftM        d |	D              dfg}tO        |      }tQ        |jR                        dk(  sJ |jR                  d   D cg c]>  }tU        |tV        jX                  jZ                  j\                        r|j^                  nd @ }}|D cg c]
  \  }}|r	| }}}ta        tc        |jd                              }tg        |||      }|| _        || _        || _        || _        d| _4        y # 1 sw Y    xY wc c}w c c}}w )NrL   r   r   inductorcudagraph_skipsr   c              3  f   K   | ])  }t        |t        j                        rt        |       + y wr*   )rP   rQ   rR   rq   .0rW   s     r1   	<genexpr>z+CompiledFxGraph.__init__.<locals>.<genexpr>  s+      4!!U\\2 +1-4s   /1r   )3check_for_mutation_ignore_cuda_graph_managed_tensorzmutated inputszcomplex memory overlapc              3     K   | ]=  }t        |t        j                  t        j                  t        j                  f       ? y wr*   )rP   rQ   rR   r   	Generatorr   s     r1   r   z+CompiledFxGraph.__init__.<locals>.<genexpr>  s2       ! 'q5<<u*WXs   AAznon-Tensor inputsT)5r   r   
cache_pathopenreadr   r   r   r   r   r   r{   r   r9   r   r   r   allocated_constant_namer   r   r   r   r   r   r   r   r   r   r   r   r   r   anyr   r    cudagraph_support_input_mutationtorch._inductor.cudagraph_utilsr   allr   rg   argsrP   rQ   fxnodeNodestack_tracer   r   graphr   r   )r/   r   r   r   r   r   r   r   r8   r7   rv   r   r   r   fkvr   complex_memory_overlap_inputsr   has_mutation_strhas_mutationcudagraph_testsoutputargry   bsr   rz   s                                 r1   r   zCompiledFxGraph.__init__F  su     !1e&&' ,1#$668 ,"00&u'9'9:%e&7&78()=)=>",U-E-E"F !$"__DN&(D#DN&(D#--/ *1"1%161N1Nq1QD++A.()DNN1%	* $)#<#< ,*D',,"!*.'..T...75d6U6U5VW Z():;q@;!!*-03 4+4 1- }}EE
 L // 33-	 % $44#?L#:J7 $(L &%'78668PQ %3  ,
# %R6;;'1,,,  &{{1~  )338J8J(KS__QUU    9H)Q1q!)Q&)Q$%9"((%CD!4 ,0F" -."*D'  ], ,z  *Rs   M.AM;
N N .M8c                    | j                   J 	 | j                  |      t               j                          t        j                          S # t               j                          t        j                          w xY wr*   )r   r   finishr   end_compiler.   s     r1   r2   zCompiledFxGraph.__call__  sa    $$000	/((0')002 ,,. ()002 ,,.s   A .A;c                F   t        ||        |rz| j                  rQd| j                  v rt        d| j                          nt        d   dxx   dz  cc<   t        j                  |       nt        || ||j                  |              | j                  }t        || |       y)a  
        Run a set of post processing steps after loading from the cache. These involve:
         - Setting the tracing context output strides
         - Running cudagraphs if enabled
         - Realigning inputs

        This runs whether or not we have a cache hit, and always runs directly after we get a CompiledFxGraph.
        The results of this function are *not* saved in the cache itself.
        r   r   r   r   r   N)r   r   r   r   r   r   r   r   r   r   r   )r/   r7   r8   r9   r   s        r1   r:   zCompiledFxGraph.post_compile  s     	+>4@ ..T...75d6U6U5VW Z():;q@;!!*-&"$$T*	 .. 		
r4   c                    || _         y r*   )r   r<   s     r1   r>   z!CompiledFxGraph.set_triton_bundle  s
    +r4   c                    d | _         y r*   )r   r/   s    r1   prepare_for_serializationz)CompiledFxGraph.prepare_for_serialization  s    
 !%r4   c           	     R   ddl m}m} ddlm}m}m}m}  || j                  d      d   }| j                  }	t        j                  j                  |      s|d   dxx   dz  cc<   t        t        j                  j                  |            j                  d	d	
        |       }
t        j                  j!                  |
      |	v rJ|
|	v rnEdt        j                  j!                  |
       d}t#        j$                  |d|
 d|	      }	|	| _	         |||	d	       	  |dd	      5  |j'                  | j                  || j(                  |j+                  |             j,                  | _        d d d        |S # 1 sw Y   |S xY w# t0        $ r t2        j5                  d|        w xY w)Nr   )r   dynamo_timed)cpp_prefix_pathget_pathPyCodeCachewrite_atomicpy   r   fxgraph_lookup_write_filer   T)parentsexist_okz#include\s*"[^"]+"z
#include ")	make_dirszPyCodeCache.load_by_key_path)log_pt2_compile_eventzFailed to load artifact: %s)torch._dynamo.utilsr   r   torch._inductor.codecacher   r   r   r   r   r   ospathexistsr   dirnamemkdirbasenameresubload_by_key_pathr   r   callr   OSErrorlogerror)r/   r9   r   r   r   r   r   r   artifact_pathcodecpp_pppatterns               r1   after_deserializationz%CompiledFxGraph.after_deserialization  s   >	
 	
 !6q9ww~~m,Z !<=B=/066td6S$&Fww'4/T> "32773C3CF3K2LANG66'Zxq+A4HD'+D$=	.&* 	 )4(D(DNN!&&$$T*	)
 $ %		 	 	  	II3]C	s+   
F 'AE8.F 8F=F F !F&)r   r   r   r   r   r   r   z*list[Optional[tuple[_StrideExprStr, ...]]]r   r&   r   r   r   r   r8   r   r7   rB   rv   r   r   r    r   r   r   r   r@   rD   r?   rA   rE   r@   rD   )r9   rC   r@   r   )rF   rG   rH   r   rK   rI   rJ   r   r   r   r   r2   r:   r>   r   r  rL   r4   r1   rr   rr   !  s   
 32N({((e4K422!!  ##''00&&99HH --//   11"" ::"&K&<@N9@A 6A  A  !	A 
 CA  %2A  4A  %A  A  ,A  )A  $A  'A  %?A  
A F/+
++
 +
 ,	+

 
+
Z,%,r4   c                  D    e Zd ZU dZded<   ddZ	 	 	 	 	 	 	 	 d	dZd
dZy)CompiledAOTIz3
    Class holding an AOTInductor compiled so.
    zUnion[str, list[str]]filenamec                    t        d      )NNYI)r,   r.   s     r1   r2   zCompiledAOTI.__call__?  s    !%((r4   c                     y r*   rL   r6   s       r1   r:   zCompiledAOTI.post_compileB       	r4   c                     y r*   rL   r<   s     r1   r>   zCompiledAOTI.set_triton_bundleJ      r4   Nr?   rA   rE   )rF   rG   rH   r   rK   r2   r:   r>   rL   r4   r1   r  r  7  sD     $#)+  ,	
 
r4   r  c                  L    e Zd ZU dZded<   ddZ	 	 	 	 	 	 	 	 d	dZd
dZddZy)MockFXGraphCacheOutputNr   r   c                    d| _         y )NT)r   r   s    r1   __post_init__z$MockFXGraphCacheOutput.__post_init__R  s
    r4   c                     y r*   rL   r6   s       r1   r:   z#MockFXGraphCacheOutput.post_compileU  r  r4   c                $    | j                  |      S r*   r   r.   s     r1   r2   zMockFXGraphCacheOutput.__call__]  s    wwvr4   c                     y r*   rL   r<   s     r1   r>   z(MockFXGraphCacheOutput.set_triton_bundle`  r!  r4   r  rA   r?   rE   )	rF   rG   rH   r   rK   r%  r:   r2   r>   rL   r4   r1   r#  r#  N  sE    BN +  ,	
 
r4   r#  )rW   torch.Tensorr@   	list[int])rW   r)  r^   r*  r@   r)  )rW   r)  r@   bool)
r7   rB   r   rr   r8   r   r9   r   r@   rD   )r   r   r   rr   r   r   r@   rD   )Gr   
__future__r   rI   loggingr  r  pathlibr   typingr   r   r   r   r	   typing_extensionsr
   rQ   r  r   r   r   r   r   r   r   torch._inductor.freezing_utilsr   r   torch._inductor.utilsr   r   r   r   r   torch.utils._ordered_setr    r   runtime.autotune_cacher   collectionsr   collections.abcr   torch._inductorr   torch._inductor.graphr   r   r    triton_bundlerr!   	getLoggerrF   r  	dataclassr#   r   rM   rK   rY   r`   rq   r   r   rC   r   rr   r  r#  rL   r4   r1   <module>r=     s  , #   	 	  @ @ '  E  N  0  8 #('3,5g! . . .2  	 M,W'W#W W '	W
 
Wt;;#; #; 
	;( ..%= .* Rj R Rj :  , Z  r4   