
    VhZ                   t	   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d d	lmZmZm Z m!Z!m"Z"m#Z# d d
l$m%Z% d dl&Z'd dl(Z'd dl)m*c m+Z, d dl-m.Z. d dl'm/Z/ d dl0m1Z1 d dl2m3Z3m4Z5mZ6m*Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE d dlFm4ZG d dlHmIZI d dlJmKZKmLZLmMZM d dlNmOZOmPZPmQZQ d dlRmSZSmTZTmUZUmVZV d dlWmXZX d dlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZa d dlbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZj d dlkmlZl d dlmmnZn d dl(moZo d dlpmqZqmrZr d dlsmtZt d d lumvZv d d!lwmxZx d"d#lymzZz d"d$l{m|Z|m}Z} d"d%l~mZ d"d&lmZ d"d'lmZ d(d)lm4Z4mZ d(d*lmZ d(d+lmZ d(d,lmZ d(d-lmZ d(d.lmZmZ d(d/lmZ d(d0lmZ d(d1lmZmZ d(d2lmZ d(d3lmZ d(d4l*mZmZmZmZmZmZmZmZmZ d(d5lmZ erd d6lmZmZ d d7lYmZ d d8lmZ d(d9lmZ  e d:      Z ed;      Zes e4j`                         s	dzd<Zd{d=Znd d>lmZmZ er
d d?lmZmZmZ  G d@ dAejp                        Zd|dBZ e       Z ejx                  e      Ze'j~                  j                  edC      Ze'j~                  j                  edD      Ze'j~                  j                  edE      Ze'j~                  j                  edF      Zd}dGZd~dHZ ej                  d      ddI       Z ej                  d      ddJ       Z	 	 	 	 	 	 	 	 ddKZddLZ	 	 	 	 	 	 ddMZd~dNZdddPZ	 	 	 d	 	 	 	 	 	 	 	 	 ddQZddRZ	 	 	 	 ddSZ	 	 	 	 	 	 ddTZ	 d	 	 	 	 	 	 	 ddUZ	 d	 	 	 ddVZej                  ddW       Z G dX dYe"dOZ      Z G d[ d\e!      Z	 	 	 	 	 	 	 	 dd]Z ed^_      	 	 	 	 	 	 	 	 dd`       Z G da db      Z G dc dde      Z G de dfeܫ      Z	 	 	 	 	 	 	 	 	 	 ddgZ	 	 	 	 	 	 ddhZ	 ddidididj	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddkZddlZ	 	 	 	 	 	 	 	 ddmZ	 d	 	 	 	 	 	 	 ddnZedf	 	 	 	 	 	 	 	 	 ddoZ ed       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddpZddqZddrZeddf	 	 	 	 	 	 	 	 	 	 	 ddsZddtZ	 	 	 	 	 	 	 	 dduZ	 	 	 	 	 	 	 	 ddvZddwZ	 dddx	 	 	 	 	 	 	 	 	 ddyZy)    )annotationsN)ABCabstractmethod)defaultdict)AbstractContextManager)currentframe)count)AnyCallableOptionalTYPE_CHECKINGTypeVarUnion)Neveroverride	ParamSpecProtocol	TypedDictUnpack)mock)#min_cut_rematerialization_partition)fx)enable_python_dispatcher)compiled_autogradconfigloggingutils)get_interface_for_device)wrap_compiler_debug)	chromium_event_timedCompileEventLoggercountersdetect_fake_modedynamo_timedflatten_graph_inputsget_metrics_contextlazy_format_graph_codeset_feature_use)r   )!unwrap_tensor_subclass_parameters)aot_export_modulemake_boxed_funcSerializableAOTDispatchCompiler)	code_hashFxGraphCacheoutput_code_log)BoxedDeviceIndexformat_default_skip_message#log_cudagraph_skip_and_bump_counterPlaceholderInfo)save_args_for_compile_fx_inner)CompiledAOTICompiledFxGraphCompiledFxGraphConstantsWithGmget_expanded_dimsindex_expanded_dims
OutputCode)	cache_dir)	BoxedBoolcount_tangentsfresh_inductor_cache	InputTypeis_gpushould_assume_input_aligned should_use_remote_fx_graph_cachetensor_is_aligned)trace_structured)compile_time_strobelight_meta)GraphModule)free_unbacked_symbolsSymExprPrinter)FakeTensorProp)_WaitCounter)
OrderedSet   )aot_autograd)ShortenTraceback	SkipFrame)_use_lazy_graph_module)_PyTreeCodeGen)
has_triton   )r   metrics)DebugContext)select_decomp_table)InductorError)joint_graph_passes)post_grad_passesview_to_reshape)pre_grad_passes)GraphLowering)get_device_typeIRNode)complex_memory_overlap)TritonBundler)	align_inputs_from_check_idxsclone_preserve_stridescopy_misaligned_inputs get_cloned_parameter_buffer_name%get_first_incompatible_cudagraph_node#maybe_get_suppress_shape_guards_ctxoutput_noderemove_unaligned_input_idxsshape_env_from_inputs)V)	GeneratorSequence)_StrideExprStr)
OpOverload)ExternKernelNode_P_Tc                "    t         j                  S N)dynamo_utilsidentityattrs    J/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/compile_fx.pytime_and_logry      s    $$$    c                      y rs    )argskwargss     rx   log_optimus_to_scubar      s    rz   )r   ry   )FQNGraphInputNameGraphSignaturec                      e Zd ZdZdZdZy)FxCompileModer   rS   rL   N)__name__
__module____qualname__NORMAL	SERIALIZE
SUBPROCESSr|   rz   rx   r   r      s    F IJrz   r   c                    d} t         j                  j                  |       }|t        j                  S 	 |j                         }t        |   S # t        $ r dd l} |j                  t              }|j                  d|| dj                  t        d t        j                  j                         D                           t         j                  j                  |        t        j                  cY S w xY w)NTORCHINDUCTOR_FX_COMPILE_MODEr   z>Invalid value of %s for %s. Expected one of %s. Using default., c              3  2   K   | ]  }t        |        y wrs   )repr.0xs     rx   	<genexpr>z+_fx_compile_mode_default.<locals>.<genexpr>   s     OT!WOs   )osenvirongetr   r   upperKeyErrorr   	getLoggerr   errorjoinsorted__members__keyspop)namevaluer   logs       rx   _fx_compile_mode_defaultr      s    *DJJNN4 E}###$U## $g)		LIIfOm.G.G.L.L.NOOP		
 	

t###$s   A B!C21C2
perf_hintspre_grad_graphspost_grad_graphscudagraph_static_inputsc                    t         j                  j                  j                         }t	        t        |             }|r|j                  s|S |j                  j                  S rs   )torch_guardsTracingContexttry_getlistrangefw_metadatastatic_input_indices)	num_fixedcontextfixeds      rx   get_static_input_idxsr      sM    
 mm**224Gy!"E'--333rz   c                   | j                   j                  d      d   }g }|j                  d   D ]  }t        |t        j
                  j                        rW|j                  j                  d      x}:t        |t        j                        r |j                  |j                                ~|j                  d         ||j                  d<   y )Noutputopr   valoriginal_output_strides)graph
find_nodesr}   
isinstancer   r   Nodemetar   Tensorappendstride)gmrg   output_stridesr   r   s        rx   record_original_output_stridesr      s    ((%%%215KN""1% (vuxx}}-..;3-!!#**,/!!$'( 3AK./rz   c                 4    t        j                  t              S rs   )dynamo_loggingget_step_loggerr   r|   rz   rx   _step_loggerr      s    ))#..rz   c                    t         j                  j                         rgt         j                  j                  j                  j
                  s8t         j                  j                         dk\  rt        j                  d       y y y y )N)   r   zTensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.)	r   cudais_availablebackendsmatmul
allow_tf32get_device_capabilitywarningswarnr|   rz   rx   _warn_tf32_disabledr      sc     	

!##**55JJ,,.&8d	
 9 6 	"rz   c           	        ddl m}m} i }| j                  d      D ]   \  }}|||<    |||||j                         " | j                  d      D ]   \  }}|||<    |||||j                         " |j                  j                  d      }	g }
|	D ]  }|j                  }||j                  v r!|j                  |   }|
j                  |       >||j                  v rE|j                  |   }|
j                  |       t        ||         |j                  t        |      <   ||j                   v sJ |
j                  d         ddlm} t'        |j                  j(                        d	   j*                  d   }g }|j,                  }|j.                  }|j0                  }t3        |      D ]y  \  }}d }|t5        |      t5        |      z   t5        |      z   k  r;|j                  |v r||j                     }n|j                  |v r||j                     }|j                  |       {  |||
|t7        j8                         d |i       }|S )
Nr   )_assign_attr	_AttrKindF)remove_duplicate)	attr_kindplaceholderr   )_unlift)torch.export.unflattenr   r   named_parameters	PARAMETERnamed_buffersBUFFERr   r   r   inputs_to_parametersr   inputs_to_buffersrb   r   rd   user_inputstorch.export._unliftr   r   nodesr}   buffers_to_mutateuser_inputs_to_mutateoutput_tokens	enumeratelenpytreeLeafSpec)modr   graph_signaturer   r   
state_dictr   parambufferplaceholder_nodeslifted_inputsnode	node_nameparameter_namebuffer_namer   outputsmutated_outputsbuffer_mutationsuser_input_mutationsr   idxoutr   unlifted_gms                            rx   _unlift_graphr      sq    ?OQJ++U+C 
e 
4))		

 ))5)A 
f!
4&&		

 ++}+=)+M " 'II	<<<,AA)LN  0/;;;);;IFK  -&z+'>? GG4[AB  ; ;;;;  &' -288>>"2&++A.GO&88*@@#11Mg& 	&S6:%&-A)BBSEWWWxx++(211,SXX6u%	& 

K rz   c           	   #    K   t        t        j                  | j                  j	                  dt
        j                  j                  j                        | j                  j	                  dt
        j                  j                  j                                    D ]  }|j                  t
        j                  j                  j                  k(  r;|j                  d   j                  }|j                  d   j                  }| | o|j                  t
        j                  j                  j                  k(  s|j                  d   j                  }|j                  d   j                  }| |  y w)Ncall_functionr   targetrS   rL   r   )r   	itertoolschainr   r   r   opshigher_ordercond
while_loopr  r}   r   )r   r   true_subgraph_namefalse_subgraph_namecond_subgraph_namebody_subgraph_names         rx   _get_subgraph_namesr  >  s    HH?599;Q;Q;V;VWHH"599+A+A+L+L   	
 % ;;%))00555!%1!2!2"&))A,"3"3$$%%[[EII22===!%1!2!2!%1!2!2$$$$#%s   D:E:==E:c                   t        ddd      5  t        j                  }t        j                  }t	        |       D ]'  }t        | |      }t        |d      }t        | ||       ) t        | |||      cd d d        S # 1 sw Y   y xY w)N_recursive_pre_grad_passesTpre_grad_pass_time_uslog_pt2_compile_eventdynamo_compile_column_usr|   )	r$   r   add_pre_grad_passesremove_pre_grad_passesr  getattrr  setattrr[   )r   example_inputs
add_passesremove_passessubgraph_namesubgraphnew_subgraphs          rx   r  r  S  s     
$"!8
 N
 //
5504 	5Mr=1H5hCLB|4		5
 r>:}MN N Ns   A#A<<Bc                    t        ddd      5  t        |       D ]  }t        | |      }t        |        t	        |        d d d        y # 1 sw Y   y xY w)N_recursive_joint_graph_passesTjoint_graph_pass_time_usr  )r$   r  r  r!  rX   )r   r  r  s      rx   r!  r!  f  s\    	'"!;
 
 14 	4Mr=1H)(3	4 	2  s   3AAFc                    t        ddd      5  t        |       D ]  }t        | |      }t        ||        t	        | |       d d d        y # 1 sw Y   y xY w)N_recursive_post_grad_passesTpost_grad_pass_time_usr  )r$   r  r  r$  rY   )r   is_inferencer  r  s       rx   r$  r$  r  sb    	%"!9
 +
 14 	@Mr=1H',?	@ 	\*+ + +s   5AAc                f   ddl m}m}m}m}m}  || |||      }	| |	       nd}
t        t        |	j                  j                        d   j                  d         D ci c]  \  }}|j                  | }}}g }g }i }| j                  j                  D ]V  }|j                  |v r|j                  |       #|j                  |   |k(  s6|j                  dk7  sF|j                  |       X |D ]B  }d|j                  z   } || |||
||j                        nd|       ||j                     ||<   D |ddd   D ]X  }|j                  r/|j                  D ]  }|j                  |   |k(  rJ d| d        >| j                  j!                  |       Z | j#                          |	|fS c c}}w )	a  
    This function takes an GraphModule input "gm".
    The gm will be split into 2 components,
      1) const_gm, which consists the subgraph of gm that can be constant folded.
      2) gm (being inplace modified,) which returns the graph after constant folding.

    If an additional "lifted_constants" argument is passed in, we will assume the gm has
    been lifted and run the transformation accordingly.

    When a "skip_folding_node_fn" callback is passed, we will skip constant folding on
    the nodes for which the callback returns True.

    const_output_index is a mapping of corresponding node name from gm to the
    output index of const_gm.
    Returns (const_gm, const_output_index)
    r   )CONST_MODULE_TAGMETA_TAG
MODULE_TAGreplace_node_with_constantrun_and_get_constant_graphNr   r   _FOLDED_CONST_znode: z user not empty.) torch._inductor.constant_foldingr(  r)  r*  r+  r,  r   tupler   r   r}   r   r   r   r   users
erase_node	recompile)r   skip_constructorlifted_constant_namesskip_folding_node_fnr(  r)  r*  r+  r,  const_gmconst_resultr   r   const_outputsto_erase_nodeto_replace_nodeconst_output_indexr   new_const_namens                       rx   split_const_gmr>  ~  s   ,  *
35IH "7!>8:DL #,E(..2F2F,G,K,P,PQR,S"TQM  MO '99%""4(YYx $44M9Q  &	'   F)DII5" )0 ]49956		
 .;499-E>*F dd# &::ZZ Wvvh':5VvEU7VV5W HH%& LLN'''Es    F-c                Z   t         j                  j                  }t        |j                  j
                  |j                  j
                  |j                  j
                  |j                  j
                  g      }|D ]  }| j                  j                  d|      D ]  }t        |j                  j                  dd       t         j                        s8|j                  d   j                  t         j                   k(  sc|j                  d   j"                  j$                  dk(  s  y  y)Nr  r  r   r   TF)r   r  atenrK   mmdefaultaddmmbmmbaddbmmr   r   r   r   r   r   dtypefloat32devicetype)r   r@  tf32_opsr  r   s        rx   is_tf32_warning_applicablerK    s    99>>DGGOOJJHHLL  		
H  HH''?6'J 	D499==5u||DIIe$**emm;IIe$++00F:	 rz   c                r   t        d | D              }t        j                  r=t        j                  r-|s+t        j                  d       t        j                  d      S t        j                  j                  r+t        j                  d       t        j                  d      S t        j                         S )z
    For CPU backend, enable comprehensive padding causes some unit tests
    fail due to changing number of generated kernels. Skip for now.
    c              3     K   | ]>  }t        |t        j                        st        |j                  j
                         @ y wrs   )r   r   r   r@   rH  rI  )r   ts     rx   r   z6maybe_disable_comprehensive_padding.<locals>.<genexpr>  s/      "#Au||9Tqxx}}s
   A$Az!Skip comprehensive padding on CPUF)comprehensive_paddingz;Skip comprehensive padding for use_runtime_constant_folding)anyr   disable_padding_cpurO  perf_hint_loginfopatchaot_inductoruse_runtime_constant_folding
contextlibnullcontext)r  has_gpus     rx   #maybe_disable_comprehensive_paddingrZ    s      '5 G !!f&B&B7>?||%88				9	9I	
 ||%88%%''rz   c                ^    | s|rt        j                  d      S t        j                         S )zH
    graph partition does not support cpp_wrapper and aot_mode yet.
    F)graph_partition)r   rT  rW  rX  )cpp_wrapperaot_modes     rx   maybe_disable_graph_partitionr_    s'     h||E22%%''rz   c                   t               5  t        |      }|s;t        j                  j	                  d      } t        | |      j                  |  n\|st        j                         n t        j                  j                  |dd      }|5   t        | |      j                  |  ddd       ddd       |S # 1 sw Y   xY w# 1 sw Y   S xY w)z}
    If we can not detect fake mode from the context of inputs, create one.

    The created fake mode will be returned.
    Tallow_non_fake_inputs)moderb  N)r   r#   r   _subclassesFakeTensorModerI   	propagaterW  rX  r   rT  objectpropagate_dont_convert_inputs)r   r  force_allow_non_fake_inputs	fake_modectxs        rx   fake_tensor_proprl    s     
"	# $^4	))88t8TI8N2I.88.I 3 &&(ZZ&&y2I4P 
  Pr	2PP#     s$   BCB:(C:C	?CCc                    t        j                  |       5  t        j                         cd d d        S # 1 sw Y   y xY wrs   )r   rT  get_config_copy)config_patchess    rx   get_patched_config_dictrp    s1     
n	% (%%'( ( (s   4=c               #     K   t         j                  r#t        t               d      5  d  d d d        y d  y # 1 sw Y   y xY ww)NF)dirdelete)r   force_disable_cachesr>   r;   r|   rz   rx   with_fresh_cache_if_configru  $  s>     "" "ik%@ 		 	 		 	s   &A;AA Ac                  r    e Zd ZU ded<   ded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   ded<   y)_CompileFxKwargszOptional[BoxedBool]
cudagraphsSequence[int]static_input_idxsboolis_backwardzOptional[int]graph_idr]  r^  r&  zOptional[bool]
layout_optz1Optional[Callable[[list[ExternKernelNode]], Any]]extern_node_serializerzOptional[BoxedDeviceIndex]boxed_forward_device_indexN)r   r   r   __annotations__r|   rz   rx   rw  rw  0  s=    ##$$NMM ::rz   rw  )totalc                  $    e Zd Z	 	 	 	 	 	 	 	 ddZy)_CompileFxCallablec                     y rs   r|   )selfr   r  r~   s       rx   __call__z_CompileFxCallable.__call__>  s    
 rz   Nr   rF   r  Sequence[InputType]r~   Unpack[_CompileFxKwargs]returnr:   )r   r   r   r  r|   rz   rx   r  r  =  s-     , +	
 
rz   r  c           	     x   |j                  dd        |j                  dd       |j                  dd       |j                  dd        |j                  dd       |j                  dd       |j                  d	d        |j                  d
d        |j                  dd        t        j                         5 }|j                  t        j
                  j                  j                                |j                  t        t        j                               |j                  t        j                  dddd             |j                  t        d      j                                t        j                  j                   j"                  r7|j                  t        j                  j                   j%                                |j                  t'                      |j                  t)                      t+        j,                  d|d           t/        t0        d      | |fi |cd d d        S # 1 sw Y   y xY w)Nrx  rz  r|   r|  Fr}  r]  r&  r  r~  r  compile_fx_innerinductor_compileT#inductor_cumulative_compile_time_us)
phase_namer  r  z#pytorch.wait_counter.dynamo_compile)r|  inductor)compiler_name)
setdefaultrW  	ExitStackenter_contextr   r   _python_dispatch_disable_current_modesrP   dynamo_configuse_lazy_graph_modulert   r$   rJ   guard_dynamocallback_handlerprevent_duplicate_callbacksinstall_callbacksru  rU   r!   pt2_compiler   _compile_fx_inner)r   r  r~   stacks       rx   r  r  F  s   
 lD)
)2.
mU+
j$'
mU+
ne,
2D9
lD)
.5 
			 
5EKK88OOQR2=3V3VWX%%"-&*)N		
 	L)NOUUWX==))EE > > P P RS689LN+&&}-	
 P"#4JO
 
7
 
 
s   7E/H00H9zcompilation time (in seconds)rv   c                   t         j                  }t        j                  | j                        dk(  r.|s,ddlm} |j                  |        t        | j                        S |j                  dd      }t        j                  d|       t        ||      }t        t        t!        t#        | j                  j$                                    j&                  d   t(        t*        f      sJ d| j                          |j-                  d      x}(t/        t0        j2                  j4                        x|d<   }t0        j6                  rt9        | |fi | t;        j:                         }t=               }	t?        d	      jA                         5 }
t?        d
      jA                         5  t0        jB                   xr t0        jD                  xs |	xr | }t0        jD                  }|	}tG        d|       tI        |      D ]L  \  }}t        |tJ        jL                        s!tO        |jP                  jR                        sA||v sFd|_*        N d}d}dd}tW        |       }t;        jX                         }|rht[        j\                  | ||||      \  }|J|\  }}|rt[        j^                         }t[        j`                  ||||||j-                  dd      |      \  }d   dk(  r|J tc        | ||fi |}nd   dk(  r|J |J te        jf                          	 tc        | ||fi |}|J t;        jX                         |z
  |_4        |d   }||_5        te        jl                         \  }}|jo                  |       	 te        j~                          |t        |      d<   |jh                  d<   t[        j                  |||||       nd   dk(  sJ |J |J |d   }||_5        |J |}d   ndt        j                  d xs i |       t        j                  d|rj-                  d      ndrj-                  d      ndrj-                  d      nd||       t        dfd fd!"       |j                  |||       ddd       ddd       t        j                  d#t;        j:                         |z
         t        j                  d$d%j                  d& t        d'   j                         D                     tJ        j                  j                  j                  j                           t               t        j                  d(|d   rd)nd* d+|d,           S # tp        tr        f$ r  tt        $ r3}tw        |ty                     j{                  |j|                        dd}~ww xY w# te        j~                          w xY w# 1 sw Y   HxY w# 1 sw Y   MxY w)-z
    Inductor API that compiles a single graph.

    If you change the argument list for this function, make sure you
    also update the call to save_args_for_compile_fx_inner below accordingly.
    r   )_LazyGraphModulerz  r|   z&static input idxs compile_fx_inner: %szGinductor can only compile FX graphs which return a tuple/list, but got rx  Nz+pytorch.wait_counter.fx_codegen_and_compilez*pytorch.wait_counter.all_compilation_typesfx_cacheTr|  F)r|  	constantscache_statebypassmisstriton_bundler_metatime_taken_nshitdisabledfx_graph_cache_)metadatatime_nsr  key
componentscache_bypass_reasonzcache not enabled)r  cache_event_timer  r  r  remote_cache_enabledlocal_cache_enabledartifactc                     d  ddS )Nr  jsonr   encodingr|   )r  s   rx   <lambda>z#_compile_fx_inner.<locals>.<lambda>:  s    -k]; &% rz   c                 .    t        j                         S rs   r  dumps)
cache_infos   rx   r  z#_compile_fx_inner.<locals>.<lambda>>  s    4::j#9 rz   metadata_fn
payload_fnz%FX codegen and compilation took %.3fsz&Overview info of inductor aten mms: %sr   c              3  4   K   | ]  \  }}d | d| d  yw)(: )Nr|   )r   r  r   s      rx   r   z$_compile_fx_inner.<locals>.<genexpr>G  s(      
$.CauBugQ
s   aten_mm_infoztorchinductor done compiling 	BACKWARDSFORWARDS graph r}  )Srj   aot_compilationrt   count_callsr   torch.fx._lazy_graph_moduler  force_recompiler+   forwardr  static_inputs_logdebugget_input_idxs_to_checkr   nextiterreversedr   r}   r/  r   r   r<   r   tritonrx  	save_argsr4   timerB   rJ   r  rt  fx_graph_cacher(   r   r   r   r@   rH  rI  _is_inductor_staticr7   r  r.   prepare_keyget_remote_cacheload_with_keyfx_codegen_and_compiler`   begin_compile_time_taken_ns_fx_graph_cache_keycollectset_triton_bundlerN   rO   	ExceptionrW   r   with_traceback__traceback__end_compilestr_save_graphr!   instantr  rD   post_compiler   rS  r   r"   items	_inductorasync_compileCompiledTritonKernelscache_clearr   r   INFO)r   r  graph_kwargsr^  r  rz  inputs_to_checkrx  startfx_graph_remote_cache_	use_cachelocalremoteiinputmb_compiled_graphkey_inforemote_cacher  
start_timer  debug_lines	cache_keytriton_bundler  ecompiled_graphr  r  s                               @@rx   r  r  {  s>    &&H)Q.x 	A((,rzz**'3'>'>?RTV'WDFWX-n>OPOd4 89:??BUDMR 
QRTRZRZQ[\R #&&|44
=2;FMM<T<T2UU\"Z&	
 	
 IIKE<> 	BCIIKZKOPABHHJZK
 +++ &&?*? 	
 %%&
I. ".1 	1HAu5%,,/5<<,,-**,0)	1 37
226	 \\^
%1%=%=NL/6&"Xz
 ##+ [#/#@#@#BL0<0J0J"  , 0 0 F'1-!: M!:h!F$,,, 6NO!7C!
 &&0$,,,'''''),$:%;G%! )44437<<>J3N!0$QK	8A!5 "))+!'!33MB ))+".478K4L
01*;*J*JJ'$$! m,555$000''' I4=1 ,,,* *4)?J}%Z 	 	""k]+%2	
 	&&#')3
u%7Az~~l3t  45(!' %	
  ! : 	##NJ	JuZK ZKx II5tyy{U7JK HH0		 
2:>2J2P2P2R
 	
 
OO!!77CCELN'&}5;:
F Gj)*	, C %i0  #A|~6EEOO
 ))+mZK ZK ZK ZKsj   W,&A6WW=WCWA"U8DWW,8W.V>>WWWWW)	$W,,W6c                  $    e Zd ZU dZded<   ddZy)_FxCompileStatr   intcodegen_and_compilec                     d| j                    S )Nzcodegen_and_compile: )r  )r  s    rx   __repr__z_FxCompileStat.__repr__]  s    &t'?'?&@AArz   Nr  r  )r   r   r   r  r  r
  r|   rz   rx   r  r  Y  s      Brz   r  c                  d    e Zd ZU dZ ee      Zded<   e	 	 	 	 	 	 	 	 	 	 dd       Z	e
dd       Zy)		FxCompileza
    An FxCompile represents a mechanism that can turn a GraphModule into an
    OutputCode.
    z%dict[type[FxCompile], _FxCompileStat]_compile_statsc                     y rs   r|   )r  r   r  r  r  s        rx   r  zFxCompile.codegen_and_compilem  s     rz   c                8    | j                   j                          y rs   )r  clear)clss    rx   _reset_statszFxCompile._reset_statsv  s      "rz   N
r   rF   r  r  r  ry  r  rw  r  r:   r  None)r   r   r   __doc__r   r  r  r  r   r  classmethodr  r|   rz   rx   r  r  a  sr     =H<WN9W
  , '	
 ' 
  # #rz   r  c                  2    e Zd Ze	 	 	 	 	 	 	 	 	 	 dd       Zy)_InProcessFxCompilec                8  -./ d|v r|d   J |d   }|j                  dd      }|j                  dd      }|j                  dd       }|j                  dd      }	t        j                  }
|j                  dd      }|j                  d	d       }|j                  d
d       }t        d      j	                         5  t        j                         5  t        j                  x},dd l	}t        j                  d|        |j                  |       t              r
t                t        d   j!                         }t#        j$                  t'        t#        j(                         d              t+               t,        j.                  d|rdnd d|        d>fd-t1        dd -fd       t        j2                  j5                         t7              }t9               t;        j<                         5  t?              }d d d        tA               t        jB                        5  tE              }|5  tG        |       d d d        t        j2                  jI                         tJ        j3                  dtM        dddd             t1        dfd        t        jN                  jP                  rdt:        jR                  jT                  jW                  jX                        /t1        dd! /fd"       /t:        jZ                  j2                  _.        t_               }|ja                         rbt"        jb                  d#k  r!te        t        d$   jg                               }nt        d$   ji                         }tk        jl                  d|%       t        jn                         r 	 tq        d&ts        tu                     i'       d d d        t        jB                  |      5  ty              5  t{        |	|
      5  d }d }d }d }|
rt        j|                  j~                  rmt              \  }}t        |g |||	|
|||d)
      }t        j                  |      5  |	sJ d*       |j                          |j                         \  }}d d d        t        |||	|
|||||r|j                  nd |r|j                  nd ||+      }t        j                         }t        j                  |      5   |j                    g }|j                  t               .|j                  D ]  }t        |t              rq|j                         rat        t        |j                                     dk(  r<|j                  t        .fd,|j                         j                  D                     |j                  d         t        |       t        d-d.      5  |j                          |j                  r,d/d0lXmY} |j                  sJ d*       |j                         \  } }!t        j2                  d1| j                         |!j                  r t        j2                  d2|!j                         d }"|j                  r1|j                  |j                        }"t        j2                  d3|"       t        d4d.      5  |j                  || j                  |!j                  |"|j                  g t        j                  |j                  j                        5      }#d d d        n|j                         j                  }#d d d        |j                         \  }$}%}&t        xj                  |$z  c_g        t        xj                  |&z  c_h        t        xj                  |%z  c_i        |r.t        j                  j                  rt        jX                  j                  st;        jZ                  j                  j                   rd }'jX                  j                  D ]  }(|(j                  j                  d6d       })|(j                  d7k(  sCt        |)t:        j                        r)t:        jZ                  j                  j                  |)      sr|(j                  j                  d8d       x}'s n d9}*|'r	|* d:|' d;}*n|* d;}*|*t        jX                  _l        |rqt        jX                  j                  sWt              }+|+rJd<|+j                   }*|+j                  j                  d8d       x}'r|* d:|' d;}*|*t        jX                  _l        t        j                  rYt        #tr        t        f      sJ t        |#      cd d d        cd d d        cd d d        cd d d        cd d d        cd d d        S |rSt        jX                  j                  s9dd=lwmx},  |,t        jX                  j                        t        jX                  _l        | j                  t        |          xj                  d/z  c_|        t        #||t        jX                  j                  |j                         t        d   |z
  |||||      cd d d        cd d d        cd d d        cd d d        cd d d        cd d d        S # 1 sw Y   xY w# 1 sw Y   wxY w# tv        $ r t        j                  d(       Y w xY w# 1 sw Y   xY w# 1 sw Y   lxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   nxY w	 d d d        n# 1 sw Y   nxY wd d d        n# 1 sw Y   nxY wd d d        n# 1 sw Y   nxY wd d d        n# 1 sw Y   nxY wd d d        y # 1 sw Y   y xY w)?Nrx  rz  r|   r|  Fr}  r]  r&  r  r  z/pytorch.wait_counter.actual_codegen_and_compiler   z3Sleeping for %s since sleep_sec_TESTING_ONLY is setr  i  ztorchinductor compiling r  r  r  c                     t        j                         } t        j                  j                  j
                  j                  | dd        | j                         S )Nr  )save_dir)ioStringIOr   r  repro	after_aotsave_graph_reprogetvalue)fdr  r   s    rx   log_graph_runnablezC_InProcessFxCompile.codegen_and_compile.<locals>.log_graph_runnable  sJ    [[]##-->>NJ ?  {{}$rz   r  c                     dddS )Nfx_graph_runnablestringr  r|   r|   rz   rx   r  z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>  s    / (% rz   c                              S rs   r|   )r%  s   rx   r  z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>  s    #5#7 rz   r  r&  %szAFTER POST GRADTinclude_strideinclude_devicecoloredinductor_post_grad_graphc                 ,     j                  ddd      S )NFTprint_outputr-  r.  )print_readable)r   s   rx   r  z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>  s    r'8'8%*4PT (9 ( rz   r  c                     dddS )Ninductor_post_to_pre_grad_nodesr  r  r|   r|   rz   rx   r  z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>  s    $E(.- rz   c                 .    t        j                         S rs   r  )provenance_tracking_jsons   rx   r  z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>
  s    4::6N+O rz   )   
   graph_break)	overwritenum_graph_breakspt2_configs)extra_loggingzfailed to log pt2_configs)	r  	shape_envr}  r]  r^  r  r&  r|  is_const_graphz"AOT mode only supports C++ wrapper)r  rA  r}  r]  r^  r  r&  r|  r;  const_wrapper_codeconst_kernel_codeconst_moduler  c              3  @   K   | ]  }j                  |        y wrs   )doprint)r   sps     rx   r   z:_InProcessFxCompile.codegen_and_compile.<locals>.<genexpr>m  s     )X1!))A,)Xs   zGraphLowering.compile_to_fn)r  rS   )AotCodeCompilerzOutput wrapper code: 
%szOutput kernel code:
%sz#Serialized Extern Kernel Nodes: 
%szAotCodeCompiler.compile)device_typeadditional_filesr   r   stack_tracezWgraph with symbolic shapes inputs and config.triton.cudagraph_skip_dynamic_graphs=True.z Found from 
z,disabling cudagraphs due to incompatible op ) check_lowering_disable_cudagraphr  )r   rj   r  rJ   r  rt   preserve_rng_stater   sleep_sec_TESTING_ONLYr  r   warningsleeprK  r   r"   copysyssetrecursionlimitmaxgetrecursionlimitr   r   r  rD   r  fx_graphri   rZ   r   no_gradrl  r   set_fake_modeget_cuda_device_contextr$  fx_graph_transformedpost_grad_graphs_logr'   traceenabledr   	tracebackget_graph_provenance_jsonr   r   _inductor_post_to_pre_grad_nodesr&   in_progressversion_infosumvaluesr  r!   compilation_metric	is_fbcoder   r  rp  
ValueErrorrZ  r_  rU  rV  r>  r\   set_graph_handlerruncodegen_with_cpp_wrapperr   rT   CachedMetricsHelpergraph_outputsrH   r   r^   has_tensor_outputr   rG   
get_strider   r/  
get_layoutr   _check_triton_bf16_supportr$   freeze_runtime_assertsr^  	codecacherJ  r]  r/   extern_kernel_nodesr  compilerK  dictfromkeyswrapper_coderL  compile_to_modulecallcount_bytesnum_bytes_accessednode_runtimesnodes_num_elemr  cudagraph_skip_dynamic_graphsdisable_cudagraphs_reasonr   any_is_symbolicr   r   r   r   re   r  r   r5   torch._inductor.cudagraph_utilsrO  device_node_mappingr  rI  r  r6   
get_deltas)0r  r   r  r  r  rx  rz  r|  r}  r]  r^  r&  r  r  	sleep_secr  inductor_countersrA  rj  cuda_contextmetrics_contextr>  r;  const_graphrC  rD  r6  r   metrics_helperr   r   rJ  rz  kernel_codeserialized_extern_kernel_nodescompiled_fn	num_bytesr  r  rM  r   meta_valdisablemaybe_incompat_noderO  r%  rI  r9  s0    ``                                          @@@rx   r  z'_InProcessFxCompile.codegen_and_compile|  s
    |+\0J0VVV ,\ :
+7+;+;<OQS+T(,,]EB"."2"2:t"D(,,]EB**)--neD5t< 	 BNAQAQ($B
"
 JKQQS_	++-_	 $:::	GI9 

9%)"-#% ( 4 9 9 ; !!#c&;&;&=t"DELN*"-;:> ?!
$%  8 GGR0
 .n=I$ B  A,R@	A +2. + :A6r:! O/NO,,R@$***)'+'+ $	 !.  <<''**DDRXXN - %"% $P 1 OO))J #6"7"..0'''1+.x/F/M/M/O+P(+3M+B+H+H+J(&99"&9I ##%	A, -s3J3L/M+e:Az 	*Q3NCQ .k8DQ
 &*""%)"$(! 3 3 P P3A"3E0H0"/ ')"+!)$/!)/E%1$/'+#K ,,[9 *P,PP{#)'@@B >*,= & $2'% +%+A!- +'9) (:'?'?( '8&=&=!,$3+. ")!<!<!>((/ YEII~.QSN**6 +,#(#6#6 <C *3 7$'$9$9$;$'(=cnn>N(O$PTU$U !/ 5 5$))X@P@W@W)X$X!" !/ 5 5d ;< /u5 &5T 3I
 446 >>B#(#4#4  D#4 9>8V8V8X5L++11 ;\=O=O  +00 / 5 5$={?P?P!" >B:$88$)$@$@(-(A(A%& !?
 !0 5 5$J$B!"
 ". 9QU" " />.E.E$)$0$6$6$/$5$5$B050A0A6&)-,1,>,>,O,O**6& /F /"	" "" +0*A*A*C*H*HKg3Ij @E?P?P?R<I~}..);.))]:)**n<* #"MMGG ! A A!OO11AA>R&*$&HHNN 
&D'+yy}}UD'AH $= 8'1(ELL'I','<'<'L'LX'V (.2iimmM4.PP{P %
& #|&)0	k]"&MG)0	nG<C9!!''*K*K.STV.W+.(TUhUoUoTp&qG.A.F.F.J.J -t/  {   .5I\+b*Q@GAGG=(()+T{CCC+K8wY YqQ Q Q Q]_	 _	 _	H
 "!''*K*K
 = ! ; ; 9 ''T
3GG1LG*#&99&113 ,/@@"&)$'2WY YqQ Q Q Q]_	 _	 _	RA AO Oh & A $?@Ao:A :Aj z" "E3I 3I9Y Y YqQ Q Q Q Q Q Q Q Q]_	 _	 _	 _	 _	 _	s~  =nD(m;:j3(m;/k2=k 	E*k26km;2m&>m	Al<(-k?A&l<;C#l&C"l Al#lE-l&0Cl&?	l<	m		m&	m;#	n6Cl&<	l<	m		m&	m; 	n3j=8m; k
k2k/	+k2.k/	/k22k<7m;?l	l<lll#l&&l/+l<3	m	<mm		m&mm&	m;&m/+m;2	n;n	 nnNr  )r   r   r   r   r  r|   rz   rx   r  r  {  sK    zz ,z '	z
 'z 
z zrz   r  c                    t         t        j                  k(  rt               }nIt         t        j                  k(  rddlm}  |       }n$t         t        j                  k(  rddlm	}  |       }j                  | |||      S )NrS   )_DebugSerdeFxCompile)_SubprocessFxCompile)fx_compile_moder   r   r  r   compile_fx_extr  r   compile_fx_subprocr  r  )r   r  r  r  schemer  r  s          rx   r  r    sc     -...$&	M33	38%'	M44	4<%'%%b./<XXrz   c                d   g }t        |       D ]  \  }}t        |t        j                        s!t	        |j
                  j                        sAt               5  ||v rt        |      r
	 ddd       et        |      s
	 ddd       z	 ddd       |j                  |        |S # 1 sw Y   xY w)z
    This function runs at compile time, and generates a list of indices for which we
    might need to do a copy to preserve alignment requirements.
    N)r   r   r   r   r@   rH  rI  rf   rC   rA   r   )inputsrz  ids_to_checkr  r  s        rx   r  r    s     Lf% 5%.ell''(02 	 %%*;E*B		 	
 /u5	 	
 6	 	A), 	 	s   B&3B&&B/	r|   )r  placeholdersmutated_input_idxsc                    ddl m}	 t        j                  j                  rEt        j                  |	|||||||t        j                  j                  j                         	      nt        d d fd}
|
S )Nr   )cudagraphify_impl)device_indexstack_tracesr|  r&  r  r  r  
compile_idc                ~    't        j                         5   |       d d d         |       S # 1 sw Y   xY wrs   )rt   rP  )
new_inputsr  cudagraphify_fnmodelrz  s    rx   rl  zcudagraphify.<locals>.runW  sH    002 T-eZARST:&&T Ts   3<)r  r  r  r
   )torch._inductor.cudagraph_treesr  r   r  cudagraph_trees	functoolspartialr   r   CompileContextcurrent_compile_id)r  rz  r  r  r|  r&  r  r  r  new_cudagraphify_implrl  r  r  s   ``         @@rx   cudagraphifyr  5  sr    
 }}$$#++!%%#%%1}}33FFH

 ,K' ' Jrz   c                    t        j                  | j                         | j                         | j                  | j
                        S )z1
    Copy and input while preserving strides
    )rF  rH  )r   empty_stridedsizer   rF  rH  )r   s    rx   static_inputr  a  s/     qvvx177188TTrz   c                V    t        | |      } t        ||      }| j                  |       y)z=Index into expanded dimensions of both dst and src then copy_N)r9   copy_)dstsrcexpanded_dimss      rx   index_expanded_dims_and_copy_r  h  s'     c=
1C
c=
1CIIcNrz   c                  	
 t        |      }t        t        |            t        ||       t	        |t
              sJ t        |      D cg c]  \  }}|vrt        |      ng  c}}t        |      D cg c]@  \  }}t	        |t        j                        s|n|vrt        |      n|j                         B c}}t        t        |            D ]8  \  }\  }}t	        |t        j                        s$|vs)t        |   ||       : t        j                  j                          t        j                  j!                         }|j#                  t        j                  j%                                t        j                  j'                  |      5   | t                     ddd       |j                          t        j                  j%                         j#                  |       t        j                  j                          t        j                  j)                         
t        j                  j+                  
|d      5   | t                    ddd       t	        t
        t,        f      sft.        j0                  rd
fd}n1t3        t5                    D cg c]	  }|vs| c}	d	
fd}t7        ||      S c c}}w c c}}w # 1 sw Y   <xY w# 1 sw Y   xY wc c}w )zQ
    Assumes inputs[static_input_idxs[i]] are always the same memory address
    Nthread_local)streamcapture_error_modec                   t              t        |       k(  sJ t        t        |             D ]u  \  }\  }}}t        |t        j
                        s%t        |t        j
                        sJ |v r$|j                         |j                         k(  rgJ t        |||       w | j                          j                          	S rs   )
r   r   zipr   r   r   data_ptrr  r  replay)
r  r   r  r  r  r   inps_expanded_dimsrz  static_inputsstatic_outputss
        rx   rl  zcudagraphify_impl.<locals>.run  s    }%Z8882;M:/AB3 K..c3 "#u||4!#u||444++<<>S\\^;;;
 2#sMJK LLN!!rz   c                    D ]8  }|   }| |   }t        |t        j                        sJ t        |   ||       : | j	                          j                          S rs   )r   r   r   r  r  r  )	r  r   r  r  copy_indicesr   r  r  r  s	       rx   rl  zcudagraphify_impl.<locals>.run  si    # V 23 7 o!#u||444-mC.@#}U	V
 LLN!!rz   )r  list[InputType]r   Callable[[list[InputType]], Any])r  rK   rh   rc   r   r   r   r8   r   r   r  detachr  r  r   synchronizeStreamwait_streamcurrent_streamr  	CUDAGraphr   r/  r   size_assertsr   r   ra   )r  r  rz  check_input_idxsr   r   r  r  rl  r  r   r  r  r  s     `      @@@@@rx   r  r  s  s    /v7HI)3#F,=>* 6#34fd###  'C !$+< <!"D  '	 C a.  ++ a		M $-S9K-L#M Paa&36G+G)-*<aOP
 
JJZZ F
uzz0023			6	" #d=!"#
	JJ++F3	JJ JJ  "E			%>		R 4tM234ntUm4(*	" 	"* !]!34
CT8TC
	" 	" (-=>>Y	*# #4 46
s1   K"AK("K.K;;	LL.K8;Lc                   t        | t              sJ |        t        |        |ddini |ddi}|j                  dt        j
                  j                        }|r|j                  d      r"J d       i |dt        | j                        i}|j                  dd       }| j                  j                  dd       }t        j                  j                  |      }t        j                   d      5  t        j                  j#                  |      5  t%        ddd	      5  t'               5  t)        | |t+        j,                  ||
      |      }t        |t.              sJ |j0                  cd d d        cd d d        cd d d        cd d d        S # 1 sw Y   nxY wd d d        n# 1 sw Y   nxY wd d d        n# 1 sw Y   nxY wd d d        y # 1 sw Y   y xY w)Nr]  Tzaot_inductor.output_pathz.pt2a
  The output path for aot_compile should not have an extension with .pt2 this is for specifying the output path for the .so in AOTInductor. If you would like to package the AOTInductor generated files into a pt2, please call `torch._inductor.aoti_compile_and_package`.r  dynamo_compile_idcompile_fx_aot)r  reset_event_log_on_exit)r  )inner_compilero  )r   rF   r)   r   r   rU  output_pathendswithr-   coder   r   r   r   r  rj   set_aot_compilationcompile_contextr    r&   
compile_fxr  r  r5   filename)	model_example_inputs_r  ro  r  r  saved_compile_idsaved_compile_contextcompiled_artifactss	            rx   r  r    s    fk*2F2* &f- ! 
44t4  !$$"F$7$7$C$CK ''/ 	
R	
/

&	&++(>

 ,//0H$O{{':DA!MM889IJ	d#+%%&;<+ 	"&$(	
+ 	+ (#++'= *
 ,l;;;!**-+ + + + + + + + + + + + + + +sa   2 GF=!F(,AF	.	F(7	F= 	GFF(	F=(F1-F=4	G=G	GGc                v   ddl m}m}	 t        |        t	        j
                  | d      }
|
rt        | |d        ||         |	|| |      \  }D cg c]  }||   	 }}t        |      }|j                  j                  ^ }}|j                  d   }t        |      D cg c],  \  }}t        |t        j                  j                        s+|. c}}|j                   d<   g }t        j"                  j$                  j'                         }dgd|,|j(                  J |j(                  }t+        dt-        |      dz
        t/        t0                  }|j2                  }|J d}t-        |      dkD  rg t5        t-        |            D ]I  }|vrd ||<   |dkD  r(||   ||dz
     k(  r|dz  }n|j7                  ||          j9                  |       K |j:                  J t5        t-        |j:                              D ]  }||vsd |j:                  |<    |j<                  r|j<                  j>                  }t@        jB                  jE                  |dd      5   ||||||d||
      d d d        tF        jH                  rS d
fd	}d|_%        |S c c}w c c}}w # 1 sw Y   8xY w)Nr   )%convert_conv_weights_to_channels_lastfreezeTr*  user_visible_output_idxsrS   rb  )rz  rx  r}  r&  r  r~  c           
         D cg c]  }| |t        |         z
      }}| j                           |      S c c}w rs   )minr  )r}   r  args_newmax_offset_idxoptimized_functionpreserved_arg_indicesunwrapped_args_offsetss      rx   wrapperz%fw_compiler_freezing.<locals>.wrapperl  sT     +
 +C>,BCCD
 
 	

!(++
s   <)r}   zlist[object]r  zSequence[torch.Tensor])&torch._inductor.freezingr  r  r!  r\   decide_layout_optrl  r#   r   r   r}   r   r   r   r   r   r   r   r   r   params_flat_unwrap_subclassesrW  r   rK   r  params_unwrapped_to_flat_indexr   addr   params_flatr   r   r   rT  rg  rj   r  _boxed_call)aot_autograd_modelaot_example_inputsdynamo_modelnum_example_inputsr  rx  r}  forward_devicer  r  r~  	opt_modelindrj  r  model_outputs_nodemodel_outputsr   r=  rz  tracing_contextparams_flat_unwrappreserved_indices_params_flatunwrapped_idxscurrent_offsetr  r  r  r  r  r  s                              @@@@rx   fw_compiler_freezingr    s    W ""45001CRVWJ+-?F-.@A'-($I$ >SSc,S1SS !34I '__22Q&++A.M#M2;QjEHHMM6R;67 mm22::<OSN"<<HHH,JJQ$6 7! ;<(23(9%(GG)))!"Q&%'"s-./ 	:A--(,"1%q5^A..Q2GG"a'N-11.2CD")).9	: **666s?6678 	6A5515++A.	6 && / ; ; P P			9&=t	D 

*/!'5!	


 	!!, , GNQ T;L

 

s   J$&,J)J)'J//J8c                     t         j                  j                  rt        t	        d             t         j                  j
                  t         j                  j
                  n	t               ddddS )Nzcpp wrapper enabledFT)ztriton.autotune_at_compile_timeztriton.autotune_cublasLtztriton.cudagraphsztriton.store_cubin)r   r  rx  r2   r1   autotune_at_compile_timerR   r|   rz   rx   get_cpp_wrapper_configr  y  sY    }}+'(=>	
 }}55A MM22$)""
 
rz   c                   t         j                  j                         st        j                         S | j
                  j                  d      }t        d |D              }t        d t        |       j                  d   D              }t        d ||z  D              }t        |      dk(  r1t         j                  j                  t        t        |                  S t        j                         S )zX
    Returns a cuda device context manager if there is a single device in the graph
    r   r   c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywr   N)r   r   r   r   r   rH  )r   r   s     rx   r   z*get_cuda_device_context.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAc              3     K   | ]j  }t        |t        j                        rNt        |j                  j	                  d       t
        j                        r|j                  d    j                   l ywr
  )r   r   r   r   r   r   r   rH  )r   args     rx   r   z*get_cuda_device_context.<locals>.<genexpr>  sN      7c277#
388<<3F(U 	7s   A0A2r   c              3  @   K   | ]  }|j                   d k(  s|  yw)r   N)rI  )r   rH  s     rx   r   z*get_cuda_device_context.<locals>.<genexpr>  s       8fkkV>S8s   rS   )r   r   r   rW  rX  r   r   rK   rg   r}   r   rH  r  r  )r   r   input_devicesout_devicescuda_devicess        rx   r\  r\    s     ::""$%%''++}+=.8 9%9 /M -7 7r?''*7 -K
 .8 8+k98 .L |! 	

$tL123 ##%rz   c                @    |rGt        j                  |      5  t         | t        j                  |            |      cddd       S t         j                  rt        j                  ddit	                     5  t        j                  |      5  |}t         t              r	 j                  j                  D cg c],  }|j                  dk(  r|j                  j                  d      . }}|D cg c]   }t        |t        j                        r|nd" }}t!        d |D              rt#        t%               ||      D ]g  \  }	}
}|

t        |t        j                        sJ |
j&                  |j&                  k7  s@t)        d|	 d	|
j&                   d
|j&                   d       |}t         |t+        j,                  d      |      cddd       cddd       S t+        j,                  t        |      }t/               st1         ||      S t         t              r1t         j                  j2                  t4              rt7         ||      S t9        t:        j<                        5  t?               5  t        j@                  jB                  jE                  t         jF                  jH                        5  t         t              rrtK        d fd       tL        jO                  dtQ        d ddd             tS         j                        t        jT                  jN                  _+        tY         |       t!        d |D              r(t[         ||      cddd       cddd       cddd       S t         j\                  rJ t_        |      ta        t         jb                  jd                        tg        d      ti        tj              ||n	tm               }	 	 	 	 	 	 	 	 d& fd}t+        j,                  |d      }to        tp        |      }t         jr                  r5t        jt                         s!t+        j,                  tv               }n't+        j,                  |d      }to        tp        |      }	 	 	 	 	 	 	 	 d'd}ty        d      	 	 	 	 	 	 d(fd       }to        tp        |      }t{        |      xs  t        j|                  j                  d      }t        j                  j                  j                         xs t        j                  j                  |      }t
        j                  rt        j                  d      5  t         |d|      \  }}dd lFmG}  ||      }|j                  j                  D ]r  }|j                  d!k(  sd|j                  vs"t        ||j                        }t        |t        j                        sS|j                  |d"      |j                  d<   t 	 ddd       t               }d# j                  v r j                  d#   |j                  d#<   d$ j                  v r j                  d$   |j                  d$<   t        j                  j                         }|rt        j                  j                  nt        j                  }t        j                  |      5  t        j                         5   |       5   |||      cddd       cddd       cddd       cddd       cddd       cddd       S t        j                  |      5  t        j                  j                  |      5  t        j                         5  t        j                  d      5  	  t        |||||d%       |      cddd       cddd       cddd       cddd       cddd       cddd       cddd       S # 1 sw Y   xY wc c}w c c}w # 1 sw Y   nxY wddd       7# 1 sw Y   AxY w# 1 sw Y   xY w# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       -# 1 sw Y   7xY w# t        $ r}|j                         dd}~ww xY w# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w))a@  
    Main entry point for compiling given FX graph.  Despite the fact that this
    lives in :mod:`torch._inductor`, this function is responsible for calling
    into AOT Autograd (and we will eventually get a callback to
    ``inner_compile`` to perform actual compilation.  In other words, this
    function orchestrates end-to-end compilation for the inductor backend when
    you use :func:`torch.compile`.

    NB: This function TAKES OWNERSHIP of the input ``model_`` and can potentially
    mutate it!  Make a copy if you need to preserve the original GraphModule.
    )r  decompositionsNr]  Fr   r   c              3  $   K   | ]  }|d u 
 y wrs   r|   )r   vs     rx   r   zcompile_fx.<locals>.<genexpr>  s     :q}:s   zBDevice mismatch between fake input and example input at position #r  z vs zx. If the model was exported via torch.export(), make sure torch.export() and torch.aot_compile() run on the same device.T)r]  inductor_pre_grad_graphc                 ^     j                  ddd      dt         j                         z   S )NFTr2  z

 # graph id: )r4  idr   )r  s   rx   r  zcompile_fx.<locals>.<lambda>  s9    6#8#8!&tD $9 $ &b&6%78$9 rz   r5  r+  zBEFORE PRE GRADr,  c              3  R   K   | ]  }t        |t        t        t        f       ! y wrs   )r   r   r/  rx  r   s     rx   r   zcompile_fx.<locals>.<genexpr>3  s     Kaz!dE401Ks   %'c           
        t        j                  d      5  |rt        |        t        j                  j
                  j                  t        |            }t        |       }t        j                  rDt        j                  |j                   }t        |      }t        j                  j                  j!                         }|%|j"                  r|s|j"                  j$                  }nd}t'        t(              rXj*                  j,                  ^ }	}
|
j.                  dk(  sJ t        j0                  |
j                        \  }}	t        |      }n|}||k  sJ ||z   }||k  sJ t3        ||      D cg c]+  }t'        ||   t        j4                  j6                        r|- c}|j8                  d<   ng |j8                  d<    | |t;        |      |      cd d d        S c c}w # 1 sw Y   y xY w)Nz$compile_fx.<locals>.fw_compiler_baser   r   r  )rz  rx  r}  r&  r  )rt   r$   r!  r   r  r   num_fw_fixed_argumentsr   rg   r   keep_output_strider   arg_tree_leavesr}   r   r   r   r   num_mutated_inp_runtime_indicesr   rF   r   r   r   tree_flattenr   r   r   r   r   )r   r  r&  r   r  r  num_model_outputsr   original_output_start_indexr  orig_model_outputs_nodeorig_model_outputsnum_orig_model_outputsorig_output_end_idxr   rx  r  r}  r  r  r  s                  rx   fw_compiler_basez$compile_fx.<locals>.fw_compiler_baseQ  s   
 **+QR I1"5--DD&N(; &1_",,$*$:$:<N<S<S$TM(+M(:%#mm::BBDG*w/B/B<#//OO 4 783!&+66<ll6H6H33699XEEE060C0C3881-*A 255G1H.1B.15FFFF  46LL (
 /2CCCC $)79L$K &mC&8%((--H	 K&++,FG KM&++,FG$"&;E&B)%!-/=CI InKoI Is   E G-70G('7G-(G--G6r*  )r  r  r  rx  r}  r  c                ~    t        |       }|5  t        |        d d d        t        | |fi |ddiS # 1 sw Y   xY w)Ncompilerr  )r\  r!  r   )r   joint_inputsr~   r  s       rx   partition_fnz compile_fx.<locals>.partition_fn  sP    
 326L 2-b126L$*5? 2 2s   3<backward)r  c                   ddl m} t        j                  d      5  |5  t	        |       }t
        j                  rlt        j                  |j                   }t        |      D cg c]+  \  }}t        |t        j                  j                        r|- c}}|j                  d<   ng |j                  d<   t!        |       }t
        j"                  rt        j$                  t'                     nt)        j*                         5   | |t-        t/        |            d
	      cd d d        cd d d        cd d d        S c c}}w # 1 sw Y   nxY w	 d d d        n# 1 sw Y   nxY wd d d        y # 1 sw Y   y xY w)Nr   )compile_lockzcompile_fx.<locals>.bw_compilerr  T)rz  rx  r|  r}  r  )torch._dynamo.convert_framer,  rt   r$   rg   r   bw_outputs_user_visibler   r  r}   r   r   r   r   r   r   r=   r]  rT  r  rW  rX  r   r   )r   r  r,  r  r  r   r=  r   rx  r  r}  r  s           rx   bw_compilerzcompile_fx.<locals>.bw_compiler  s\    A ))*KL &1_"11$*$:$:<N<S<S$TM '0&>K"C%a7 K&++,FG KM&++,FG&r* )) LL!7!9:#//1
 )&*.uU|*<#-$(!)3A !  K  !     sT   E1AE&0E 
A-E!E$	E-	E1 EEE	E1E%	!E11E:ra  )unlift_effect_tokens)trace_jointr  r   )_detect_fake_mode_from_gmget_attr)static_shapes dynamo_flat_name_to_original_fqnr  )fw_compilerr/  inference_compilerr  r)  keep_inference_input_mutationsrx  )r   rF   r  r  r&  r{  r  r:   )r   rF   r(  zSequence[object]r~   rg  r  ztuple[GraphModule, GraphModule])r   rF   r  r  r  r:   )Xr   rT  r  r]  r  rj   set_real_inputsr   rF   r   r   r   r   r   r   r   rP  r  r	   rH  rj  r  r  graph_returns_tuplemake_graph_return_tuple_codegenrQ   handle_dynamo_export_graphrP   r  r  r   r   ra  preserve_node_metar_  r`  rD   pre_grad_graphs_logr  r'   r  r  _pre_grad_graph_idr  r%   _raise_error_for_testingr   r<   r  rx  r0   r  _graph_counterrV   r,   r:   freezingis_grad_enabledr  rE   r#   rd  re  r   r   r   r  functorch_configr*   torch._export.utilsr2  r  r  from_tensorr   _C_is_any_autocast_enabled_DisableAutocastrW  rX  r[  r   _disabletracingrM   rN   remove_dynamo_frames) r  r  r  ro  r  inputs_r   fake_inputsinpr   fir  recursive_compile_fxr%  r6  r7  r)  r/  rj  r  r   r   r2  r  r   disable_ampr   r  rx  r  r}  r  s    ` `                         @@@@rx   r  r    s   , \\.) 	:fll>:=I-	 	 LL!5,.)	 o.)	 ,;G&+. !' 2 2ww-/ IIMM%(   + &c5<<8CdB 
 :k::&)%';&H "
R>#-a#>>#>!yyAHH4&0&hilhmmo')yykahhZ @o%o'" !"	" *G'//4P-	I)	 )	 )	V %,,#% v&& 
 	
 &+&:~, * 
 	
 	}BBCd9 "d9 	--fll.B.BCd9 fk*)9  %%&%#'#' 	 8:&,,7GEOO!!4/HF
 K?KK'$Id9 d9 d9 d9T 2222 1 v}}778
 *$/ ' -8N>Q>S 	N	N	/N	 N	 	N	 N	b .UC 	 6j+N??5#8#8#:5>5F5F$##5+%!-6 "+!2!23CRV!W!@."
	
	*
	 
	 -	
	 
'*	="	"	-@"	"	 
>"	H 6j+N$
 J--D-I 	 MM((002 7}}++I6 	
 !''TB &7# %#1	'#O J5b9	 HHNN Dww*,dii1G!(T[[!9%fell;/8/D/D &d 0E 0DIIe,	#2 (ODK1V[[@GM{{6H  !CD #fkk18>DW8X  !45  ((;;=K-8))j>T>T  + H->-G-G-I H79 H)+GH H H H]d9 d9 d9 d9d OOI&	9MM!!/2	9 &&(	9 ""=		99| + +'9#1!-37) /+	9 	9 	9 	9 	9cd9 d9 d9 d9_	 	.')	 )	 )	 )	 )	L	 XH H H H H H H H& $ 9 ,,.D89!	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9cd9 d9 d9 d9 d9 d9 d9 d9 d9s  (_5`",`1`5`;%` 2`5`	A`	`"9dAc?B!c*'	c?0	dG%c*(A	`/	2`/	0`/	2"`/	Cc*a'	2a:	`<	a	a'		c*	c?'	d:c* c	/c b+ba45	b+>	c 	c		c*	c?"	d5_?
``	`""`,/`94c*<aa	a'	aa'	
c*'a1,c*4	b=bbbbb+"	c +b40c 7	c	 c	c		c*cc*!	c?*c3/c?6	d?d	ddc                   t        | t              syt        |       j                  \  }t        |t        t
        f      ryt        |t        j                  j                  j                        rst        |j                  d      r]t        |j                  j                  j                        dkD  r1t        d |j                  j                  j                  D              ryy)z"True if a FX graph returns a tupleT_schemarS   c              3  L   K   | ]  }t        |j                        d k(    yw)r   N)r  rI  )r   rets     rx   r   z&graph_returns_tuple.<locals>.<genexpr>B  s     OcCHH)Os   "$F)r   rF   rg   r}   r   r/  r   r   r   r   hasattrr  r   rU  returnsall)r   rvs     rx   r:  r:  7  s    b+&O  ER"tUm$2uxx}}))*BIIy)		!!))*Q.ORYY5F5F5N5NOO rz   c                   t        |       }|j                  \  }t        j                  |      \  }| j                  j                  |      5  | j                  j                  |       ddd       | j                  j                  |       t        |       sJ  || |      t        j                        dfd       }|S # 1 sw Y   [xY w)z
    Mutate gm so it returns a tuple.  This is only needed for graphs
    not created by torchdynamo that return non-tuples.
    Nc                 <    t        j                   | i |      S rs   )r   tree_unflatten)r}   r~   r  specs     rx   r  z(make_graph_return_tuple.<locals>.wrapper\  s     $$[$%A&%A4HHrz   )r}   r
   r~   r
   r  r
   )rg   r}   r   r  r   inserting_beforer   r1  r:  r  wraps)r   r  
compile_gmr   r[  r  r  r_  s         @@rx   r;  r;  I  s     r?DIIER""2&HB		"	"4	( 
HHr"""R(K__[!I "I N s   CCc                .   | j                   j                  t        j                  j                   j	                         | j                   _        | j                           ||  j                  |       t        j                        dfd       }|S )z
    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
    convert that to a normal FX graph so inductor can compile it.
    c                 F    j                    j                  |         S rs   )process_outputsprocess_inputs)r}   codegenr  s    rx   r  z+handle_dynamo_export_graph.<locals>.wrapperr  s'    &&{4JG4J4JD4Q'RSSrz   )r}   r
   r  r
   )	r   r<  r   r   CodeGenr2  rf  r  ra  )r   r  rb  r  rg  r  s       @@rx   r=  r=  c  sx     hhG..0BHHLLNR!7!7!7!@AK__[!T "T Nrz   c                   dd}t        j                  | j                  j                         | j                        D ]  }t        |t              st        |      }|r,t        |      r!|j                         t        j                  k7  rNt        |      }|j                  d      r y  ||j                                 y )Nc                    ddl m} | J t        | j                        }|j	                  |       }t        j                  |j                   d        |d      )Nr   )rO   z9 does not support bfloat16 compilation natively, skippingzBF16 is not supported)torch._dynamo.excrO   r   rI  get_device_propertiesr   r   r   )rH  rO   device_interfacedevice_propss       rx   warn_and_skipz1_check_triton_bf16_support.<locals>.warn_and_skipz  s\    /!!!3FKK@'==fE  !!Z[	
 /00rz   F)including_emulation)rH  zOptional[torch.device]r  r   )r  r  graph_inputsrg  ro  r   r^   r]   r@   	get_dtyper   bfloat16r   is_bf16_supported
get_device)r   ro  r   rK  rm  s        rx   rs  rs  y  s    
1  2 2 9 9 ;U=P=PQ )$'%d++&~~5>>1 4K@--%-Hdoo'()rz   )optionsc                  ddl m}  ||       sJ d       d}d}t        | j                  j                  t
        j                  j                  j                        r| j                  j                  }t
        j                  j                  j                         | j                  _        | j                          |j                  j                  |j                  j                  }|j                  j                  G|j                  j                  }n0t        | d      r| j                  }t        | d      r| j                  }|t!        j"                  |      nd}|t!        j"                  |      nd}	t!        j$                  ||xs i f      \  }
}|
D cg c]&  }t        |d   t
        j&                        r|d   nd( }}|||k7  rt)        d| d	|       |||	d
ni |||	d
}||fS c c}w )z
    Flatten the inputs to the graph module and return the flat inputs and options.
    Add "aot_inductor.serialized_in_spec" and "aot_inductor.serialized_out_spec" to the options.
    rS   )r:  zGraph output must be a tuple(). This is so that we can avoid pytree processing of the outputs. Please change the module to have tuple outputs.N_in_spec	_out_spec z>Trying to flatten user inputs with exported input tree spec: 
z-
but actually got inputs with tree spec of: 
)zaot_inductor.serialized_in_specz aot_inductor.serialized_out_spec)r  r:  r   r   r<  r   r   rQ   rh  r2  pytree_infoin_specout_specrX  rx  ry  r   treespec_dumpstree_flatten_with_pathr   rj  )r   r}   r~   rv  r:  r|  r}  rg  serialized_in_specserialized_out_specflat_args_with_pathreceived_specr   flat_example_inputss                 rx   _aoti_flatten_inputsr    s    0r" 	" GH"((##UXX^^%B%BC((##!HHNN224
&&2))11G''3**33H 2z"kkG2{#||H;B;N..w7TV+3+?h'R  *0)F)F	v|*& CV=>
1Q4.!D8  }7Mi <o
 	
 ? 0B0C	



/A0C
  ''1s   +G/)rw   r  r  z.Callable[[Callable[_P, _T]], Callable[_P, _T]])r}   rg  r~   rg  r  r  )r  r   )r   r  r  	list[int])r   rF   r  r  )r  zCallable[..., None]r  )r   rF   r   rF   r   r   r  rF   )r   rF   r  zGenerator[str, None, None])r   rF   r  r  r  rF   )F)r   rF   r&  r{  r  r  )TNN)
r   rF   r3  r{  r4  zOptional[list[str]]r5  z)Optional[Callable[[torch.fx.Node], bool]]r  z"tuple[GraphModule, dict[str, int]])r   rF   r  r{  )r  r  r  "AbstractContextManager[None, None])r]  r{  r^  r{  r  r  )r   rF   r  r  ri  r{  r  z torch._subclasses.FakeTensorModers   )ro  z$Optional[Union[str, dict[str, Any]]]r  zdict[str, Any])r  zGenerator[None, None, None]r  )r   rF   r  r  r  r  r  r:   )
r   rF   r  r  r  ry  r  r  r  r:   )r  r  rz  ry  r  ry  )r|   )r  Callable[..., Any]rz  ry  r  r  r  zlist[Optional[str]]r|  r{  r&  r{  r  ztuple[torch.Tensor, ...]r  zSequence[PlaceholderInfo]r  ztuple[int, ...]r  r  )r   torch.Tensorr  r  )r  r  r  r  r  r  r  r  )r  r  r  zlist[torch.Tensor]rz  ry  r  r  )
r  rF   r  r  r  r  ro  zOptional[dict[str, str]]r  zUnion[list[str], str])r  rF   r  r  r  rF   r  r  r  r  rx  r<   r}  r  r  r0   r  z0Callable[[list[object]], Sequence[torch.Tensor]])r  zdict[str, object])r   torch.fx.GraphModuler  zAbstractContextManager[None])r  rF   r  r  r  zCallable[..., OutputCode]ro  Optional[dict[str, Any]]r  z.Optional[dict[OpOverload, Callable[..., Any]]]r  zGUnion[Callable[[list[object]], Sequence[torch.Tensor]], str, list[str]])r   rF   r  r  rb  r  r  r  )r   r\   r  r  )
r   r  r}   z!Union[list[Any], tuple[Any, ...]]r~   r  rv  r  r  z tuple[list[Any], dict[str, Any]])
__future__r   rW  enumr  r  r  r  r   r   rU  r  r   abcr   r   collectionsr   r   inspectr   r	   typingr
   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   unittestr   torch._inductor.async_compiler   torch.fxtorch.utils._pytreer   _pytreer   functorch.compiler   r   torch._dispatch.pythonr   torch._dynamor   r   r  r   rt   torch._dynamo.device_interfacer   torch._dynamo.repro.after_aotr   torch._dynamo.utilsr    r!   r"   r#   r$   r%   r&   r'   r(   torch._functorchrE  7torch._functorch._aot_autograd.subclass_parametrizationr)   torch._functorch.aot_autogradr*   r+   r,   torch._inductor.codecacher-   r.   r/   r  r0   r1   r2   r3   torch._inductor.debugr4   torch._inductor.output_coder5   r6   r7   r8   r9   r:   %torch._inductor.runtime.runtime_utilsr;   torch._inductor.utilsr<   r=   r>   r?   r@   rA   rB   rC   torch._loggingrD   torch._utils_internalrE   rF   %torch.fx.experimental.symbolic_shapesrG   rH    torch.fx.passes.fake_tensor_proprI   torch.monitorrJ   torch.utils._ordered_setrK   _dynamo.backends.commonrM   _dynamo.excrN   rO   fx._lazy_graph_modulerP   fx.graphrQ   utils._tritonrR   rz  rT   r  rU   decompositionrV   excrW   fx_passes.joint_graphrX   fx_passes.post_gradrY   rZ   fx_passes.pre_gradr[   r   r\   irr]   r^   output_coder_   triton_bundlerr`   ra   rb   rc   rd   re   rf   rg   rh   ri   virtualizedrj   collections.abcrk   rl   rm   
torch._opsrn   ro   rp   rq   ri  ry   r   torch._inductor.fb.utils&torch._functorch._aot_autograd.schemasr   r   r   Enumr   r   r  r   r   r   _logginggetArtifactLoggerrR  r?  r^  r  r   r   	lru_cacher   r   r   r  r  r!  r$  r>  rK  rZ  r_  rl  rp  contextmanagerru  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  rB  r  r  r\  r  r:  r;  r=  rs  r  r|   rz   rx   <module>r     s,   "    	    	 
   # # -    I I U U  $  $ $ A  ;  D =
 
 
 8 
 O N  A  <	 	 	 , ?   W ; & / 2 5 : % &   .  5 B /   ' I )
 
 
  3:%$ t_T](((*% L DII $. +,g!00<Hnn66xARS ~~77BTU NN44' 

4A T/ / T	
 	
I	I%I8FIIX%*NN'N N&		+ "15FJ	E(E(E( /E( D	E(
 (E(P*('('(.	(	(!%	('	( ).' "& &	@ <@(8((  
;y 
; 2
2
'2
 '2
 	2
j 23ZZ'Z -Z 	Z 4ZzB B# #4|) |~YY'Y
 #Y -Y Y0  $   J (*) +-.0*,))$) 	)
 &) ) ) () ,) () )XU		  
	 (*\?\?\? %\? &	\?D )9/3	;+;+$;+ &;+ -	;+
 ;+| qc#c+c c 	c
 &c c c %c 6cL&B 0@/3EIJ9J9(J9 -J9 -	J9
 CJ9 MJ9Z$ # 	4 # 	,)D (,I(
 )-I(I(
+I( %I(
 &I( &I(rz   