
    Vh3                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d d	l&m'Z'm(Z(m)Z)m*Z* d d
l+m,Z, d dl-m.Z. d dl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZAmBZB erd dlCZCe2j                  ZEeFeGd<    ej                  eI      ZJej                  eGd<   ej                  j                  ZMej                  j                  ZNe G d d             ZOe G d d             ZPe G d d             ZQdej                  deFfdZSd ej                  deFfd!ZUd ej                  deFfd"ZVdej                  deWfd#ZX G d$ d%      ZY eY       ZZ	 dnd&ej                  d'e\ej                     d(e\ej                     d)ee]   dej                  f
d*Z^dej                  deFfd+Z_dej                  deFfd,Z`dej                  deFfd-Zadej                  deFfd.Zbdej                  deFfd/Zcdej                  deFfd0Zddej                  deFfd1Zedej                  deFfd2Zfd3ej                  dege\ej                     e\ej                     f   fd4Zhd5e\ej                     d6e]fd7Zid3ej                  d5e\ej                     d8e\ej                     d9eWdegej                  ej                  f   f
d:Zjd3ej                  degej                  ej                  f   fd;Zk eWd<      Zld=eWdeWfd>Zmdej                  deWfd?Znd@ej                  fdAZo ej                  d      dB        ZqdCerej                  eWf   de\egej                  eWf      fdDZsdEej                  dej                  fdFZtdGej.                  j                  dHej.                  j                  dIej.                  j                  dJej.                  j                  dKej                  dLeWdMej.                  j                  dNej.                  j                  fdOZvd3ej                  dGej                  dHej                  dPeWdegej                  ej                  f   f
dQZwd3ej                  dej                  fdRZx	 dnd&ej                  dSePdTeQdUee.ej                        fdVZydW ZzdeOfdXZ{d@ej                  fdYZ|d&ej                  dZe\e}   d[e\e}   d\e}dSePd]e\ej                     dege}e\eW   e\eW   f   fd^Z~d d_lmZ d`ej                  daeWdej                  fdbZdc Z	 dod&ej                  dSePde\ej                     fddZ	 dpd3ej                  degej                  ej                  f   fdeZ	 	 	 	 	 dqdfej.                  j                  dge]dhe]dieFdjeee]e\e]   f      dkeFdlee]   ddfdmZy)r    Ndefaultdict)	dataclassreplace)CallableOptionalTYPE_CHECKINGUnion)(create_structured_trace_for_min_cut_info)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_node)graph_drawer)
OrderedSet)CheckpointPolicy   )config)GraphInfoProvider)dp_knapsackgreedy_knapsackilp_knapsack)KnapsackEvaluator)get_aot_graph_name)get_cuda_generator_meta_valis_with_effects)fx_graph_cseget_aten_targetAOT_PARTITIONER_DEBUGlogc                      e Zd ZU dZee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   dej                  fdZ	dej                  fd	Z
dej                  fd
Zdej                  fdZdej                  fdZy)OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 0    t        |      | j                  v S N)r#   r(   selfr-   s     M/home/dcms/DCMS/lib/python3.12/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusibleF   s    t$(8(888    c                 0    t        |      | j                  v S r/   )r#   r)   r0   s     r2   is_compute_intensivezOpTypes.is_compute_intensiveI   s    t$(B(BBBr4   c                 0    t        |      | j                  v S r/   )r#   r*   r0   s     r2   	is_randomzOpTypes.is_randomL   s    t$77r4   c                 0    t        |      | j                  v S r/   )r#   r+   r0   s     r2   is_viewzOpTypes.is_viewO   s    t$55r4   c                 0    t        |      | j                  v S r/   )r#   r,   r0   s     r2   is_recomputablezOpTypes.is_recomputableR   s    t$(=(===r4   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder3   r6   r8   r:   r<    r4   r2   r'   r'   <   s    BH%%%h//8$$"" **9rww 9C C8bgg 86BGG 6>BGG >r4   r'   c                      e Zd ZU eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                  e	f   ed<   e
j                  deej
                     fd       Zdej
                  defd	Zdej
                  defd
Zdej
                  defdZdej
                  de	fdZy)NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderreturnc                 F     t        d  j                  D         fd      S )Nc              3       K   | ]  }|  y wr/   rD   .0ns     r2   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>c   s     01Q0s   c                 "    j                   |    S r/   )rK   )rQ   r1   s    r2   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>c   s    a@P r4   key)sortedrH   r1   s   `r2   required_fw_nodeszNodeInfo.required_fw_nodes`   s!    0//06P
 	
r4   rQ   c                     || j                   v S r/   )rH   r1   rQ   s     r2   is_required_fwzNodeInfo.is_required_fwf   s    D++++r4   c                     || j                   v S r/   )rI   r[   s     r2   is_required_bwzNodeInfo.is_required_bwi   s    D****r4   c                     || j                   v S r/   )rJ   r[   s     r2   is_unclaimedzNodeInfo.is_unclaimedl   s    D((((r4   c                 R    || j                   v sJ d| d       | j                  |   S )NNode z not in fw nodes!)rH   rK   r[   s     r2   get_fw_orderzNodeInfo.get_fw_ordero   s4    D+++IuQC7H-II+}}Qr4   N)r=   r>   r?   listrB   rC   rA   r   dictint	functoolscached_propertyrY   boolr\   r^   r`   rc   rD   r4   r2   rF   rF   V   s     M"277++!"''**((277C<  
4= 
 

, ,D ,+ +D +)bgg )$ ) bgg  #  r4   rF   c                   @    e Zd ZU eed<   eed<   eed<   eed<   eed<   y)MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)r=   r>   r?   ri   rA   rD   r4   r2   rk   rk   t   s      $$"&&!!r4   rk   r-   rL   c                 z    | j                   j                  dd       t        j                  t        j                  fv S )N	recompute)metagetr   MUST_RECOMPUTEPREFER_RECOMPUTEr-   s    r2   must_recomputerx   }   s5    99==d+''))0  r4   fx_gc                 T    | j                   j                  D ]  }t        |      s y y)NTF)graphnodesrx   ry   r-   s     r2   has_recomputable_opsr~      s+    

   $ r4   c                     | j                   j                  D ]W  }t        |      st        |j                  d      s&t
        j                  j                  |j                  j                  v sW y y)NtagsTF)	r{   r|   rx   hasattrtargettorchTagnondeterministic_seededr   r}   s     r2   has_recomputable_rng_opsr      sU    

   4 V,		11T[[5E5EE r4   c                     t        | j                  d   t        j                  t        j                  f      ryt        | j                  d   t        j
                        sJ y)Nvalr      )
isinstancers   r   SymIntSymBoolSymFloatrw   s    r2   sym_node_sizer      sE    $))E"U\\5==$ABdii&777r4   c                       e Zd Zd Zy)InvalidNodeBasec                      y)NzInvalid NoderD   rX   s    r2   __repr__zInvalidNodeBase.__repr__   s    r4   N)r=   r>   r?   r   rD   r4   r2   r   r      s    r4   r   joint_graphrG   outputssubgraphc                 `  
 t        j                         }i 
|D ]3  }|j                  |j                        }|j                  |_        |
|<   5 | j
                  D ]  }t        |      r|dk7  r
t        
|<   |
v r#|j                  dk(  r
t        
|<   <|j                  dk(  rt        j                  |j                  i |j                  }|D cg c]/  }t        |t         j                        rt        
|   t              1 }}t!        |      r
t        
|<   |j#                  |
fd      
|<   |j                  dk(  r|j#                  |
fd      
|<   |j                  dk(  s g }	|D ]s  }t        |t         j                        rF|
vrt%        d| d	      t        
|   t              rJ d| d
       |	j'                  
|          c|	j'                  |       u |j)                  t+        |	             |j-                          |j/                          |S c c}w )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardplaceholdercall_functionc                     |    S r/   rD   xenvs    r2   rT   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF r4   get_attrc                     |    S r/   rD   r   s    r2   rT   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   r4   outputrb   z couldn't be found in envz was invalid, but is output)rB   Graphr   namers   r|   _must_be_in_backwardInvalidNodeoppytreearg_tree_leavesargskwargsr   rC   r   any	node_copyRuntimeErrorappendr   tupleeliminate_dead_codelint)r   rG   r   r   	new_graphr-   new_nodeall_argsr   output_valuesr   s             @r2   "_extract_graph_with_inputs_outputsr      s%     
I
C  ((3		D		 !! %(j*@#CI3; WW%#CIWW'--tyyHDKKHH "a) 3q6?3H 
 8}'D	!++D2BCCIWW
"!++D2BCCIWW 56 M 	$a!|"U1#-F#GHH!A 6qc456    Q(  #	$ U=)*!!#NN9s   4H+c                     | j                   dk(  xr3 dt        | j                        vxr t        |        xr t	        |        S Nr   tangents)r   strr   _is_bwd_seed_offset_is_fwd_seed_offsetrw   s    r2   
_is_primalr      sK    =  	*c$++..	*#D))	* $D))	r4   c                 R    | j                   dk(  xr dt        | j                        v S r   r   r   r   rw   s    r2   _is_tangentr      s$    77m#F
c$++6F(FFr4   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   bwd_seedbwd_base_offsetr   rw   s    r2   r   r      =    77m# c$++&&O*;s4;;?O*Or4   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   fwd_seedfwd_base_offsetr   rw   s    r2   r   r      r   r4   c                 v    | j                   dk(  xr) t        | j                  j                  d      t              S )Nr   r   )r   r   rs   rt   r   rw   s    r2   _is_backward_stater     s*    77m#W
499==3G(WWr4   c                 @    | j                   j                  dd       dk(  S )Npartitioner_tagis_backwardrs   rt   rw   s    r2   _has_tag_is_backwardr   	  s    99==*D1]BBr4   c                 @    | j                   j                  dd       dk(  S )Nr   must_be_in_backwardr   rw   s    r2   _has_tag_must_be_in_backwardr     s    99==*D15JJJr4   c                 L    t        |       xs t        |       xr t        |       S r/   )r   r   r!   rw   s    r2   r   r     s&    '- T"<t'<r4   joint_modulec                    t        j                  d | j                  j                  d      D         }|d | }||d  }||fS )Nc              3   4   K   | ]  }|j                     y wr/   )r   rP   r-   s     r2   rR   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>  s     	K$))	Ks   r   r   )r   r   r{   
find_nodes)r   num_fwd_outputsr   fwd_outputsbwd_outputss        r2   _extract_fwd_bwd_outputsr     sW     $$	K 2 2 = = = J	KG *?+K/*+K##r4   saved_valuesr   c                 V    | D ]$  }|j                   |k(  s| j                  |        y  y r/   )r   remove)r   r   saved_values      r2   _remove_by_namer   "  s0    # t#,r4   saved_sym_nodesr   c                Z   t        | |      \  }}| j                  j                  d      }g t        t        |      }g t        t
        |      }g t        t        |      }	g t        t        |      }
g t        t        |      }t        | j                  ||z   |z   |
z   |d      }|j                  d      D ]a  }|j                  s-t        ||j                         t        ||j                         <t        |      sHt        ||j                         |raJ  t               }g }g }|D ]C  }t        |      }|r#|j                  |       |j!                  |       3|j!                  |       E t#        | j                        }t%        j&                  |||      D ]]  }d|j(                  vrt+        |j(                  d         |z
  }t-        |d       D ]  }||vr|j!                  ||           ||z  }_ |j/                          |j1                  ||z          t        | j                  ||	z   ||z   |z   d      }t        | j                  ||z   |z   |
z   |z   |d      }t2        j4                  j7                  | |      }t2        j4                  j7                  | |      }||fS )	Nr   r   r   r   r   c                     | j                   S r/   r   )ss    r2   rT   z*_extract_fwd_bwd_modules.<locals>.<lambda>e  s
    166 r4   rU   forward)r   r{   r   filterr   r   r   r   r   r   usersr   r   r   r   addr   r   	itertoolschainrs   r   rW   clearextendrB   _lazy_graph_module_make_graph_module)r   r   r   r   r   r   placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphr-   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr   	fwd_graph
fwd_module
bwd_modules                           r2   _extract_fwd_bwd_modulesr  )  s     8o K  %%00M0BL7fZ67M9vk<89NIv&9<HIIv&9<HIGf%7FG2,&7:PP	I $$$6 )zzL$))4OTYY7%L$))4((() /9lM     1*40f%#**40#**401 3<3E3EFO 7~V %		!"499U#34}D)9: 	?A '#**?1+=>	? 	$%" 25LLM 3..l"_4	I 3
	
	 !	!  		 
 		I &&99,	RJ&&99,	RJz!!r4   c                \   t        |       rt        | ||      S t        t        t        | j
                  j                              }t        t        t        | j
                  j                              }||z   }t        | |      \  }}t        | j
                  ||d      }t        d |j                  D              }	g }
g }| j
                  j                  D ]  }|j                  |	vrt        |      r|j                  |       /d|j                  vrA|j                  dk(  r2|j                   }t#        d |D              sJ |
j%                  |       ~|j                   D cg c]  }|j                  |	vs| }}d|j                  v r$t#        d |D              r|j%                  |       |
j                  |        t        t&        j)                  |
      j+                               }
t        t&        j)                  |      j+                               }t-        | |
||      S c c}w )	a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    r   r   c              3   T   K   | ]   }|j                   d k7  s|j                   " ywr   Nr   r   r   s     r2   rR   z$default_partition.<locals>.<genexpr>  s$      $$''X:M		$   ((tensor_metar   c              3   V   K   | ]!  }|j                   t        j                  k(   # y wr/   )r   operatorgetitemrP   users     r2   rR   z$default_partition.<locals>.<genexpr>  s     I4t{{h&6&66Is   ')c              3   2   K   | ]  }t        |        y wr/   r   rO   s     r2   rR   z$default_partition.<locals>.<genexpr>  s      2#$A2   r   r   )r~   #min_cut_rematerialization_partitionrd   r   r   r{   r|   r   r   r   r   r   r   r   rs   r   r   allr   re   fromkeyskeysr  )r   _joint_inputsr   r   r   rG   r   r   forward_only_graphforward_node_namesr   r   r-   r   rQ   backward_usagess                   r2   default_partitionr    s   4 L)2-
 	
 
L,>,>,D,DEFM!&)<l>P>P>V>V"WX33F7o K <FK $ $066$  LO""(( *99..t ""4($))+?0JJJEI5IIII&  ::7I)IO  		)c 2(72 /  &&7##D);*< l388:;L4==9>>@AO#''	 )s   #H)7H)g    .Anumelc                      | |j                   z  S r/   )itemsize)r  dtypes     r2   _tensor_nbytesr    s    5>>!!r4   c                 V   dt         fdd| j                  v r| j                  d   }t        |t              ryt        |t        t
        f      rt        fd|D              S t        |t              r"t        fd|j                         D              S t        |t        j                        r |      S t        dt        |       d|        | j                  d	k(  s:| j                  t        j                  j                   j"                  j$                  u ry
t        d|  d      )NrL   c                     t        | t        j                        syt        t	        | j                         d      | j                        S )Nr      fallback)r   r   Tensorr  r   r  r  r   s    r2   object_nbytesz_size_of.<locals>.object_nbytes  s1    !U\\*hqwwy4@!''JJr4   r   r   c              3   .   K   | ]  } |        y wr/   rD   )rP   rQ   r'  s     r2   rR   z_size_of.<locals>.<genexpr>  s     5A}Q'5   c              3   4   K   | ]  \  }} |        y wr/   rD   )rP   _rQ   r'  s      r2   rR   z_size_of.<locals>.<genexpr>  s     @DAq}Q'@   zUnknown metadata type z	 on node r   r   rb   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)rf   rs   r   r   rd   r   sumre   itemsr   r%  r   typer   r   opsaten_assert_scalardefault)r-   r   r'  s     @r2   _size_ofr4    s    KC K
 		iic<( dE]+5555T"@CIIK@@@U\\* %%3DI;ivNOOww*uyy~~/L/L/T/T T

vde r4   r{   c                    ddl m}  |t              }| j                  D ]3  }|j                  dk(  s||j
                  j                  xx   dz  cc<   5 t        j                  dt        |j                         d d             y )	Nr   r   r   r   z%sc                     | d   S Nr   rD   r&  s    r2   rT   z_count_ops.<locals>.<lambda>  s
    QqT r4   TrV   reverse)collectionsr   rf   r|   r   r   r=   r%   inforW   r.  )r{   r   cntr-   s       r2   
_count_opsr=    sg    '%c*C +77o%$$%*%+ HHT6#))+>4HIr4   c                     g } t        t        j                  j                        D ]  }t	        t        j                  j                  |      }t        |t        j                  j                        sL|j                         D ]G  }t	        ||      }t        j                  j                  |j                  v s6| j                  |          | S r/   )dirr   r0  r1  getattrr   _opsOpOverloadPacket	overloadsr   	pointwiser   r   )r0  	attr_nameopoverloadpacketoverloadop_overloads        r2   pointwise_opsrI    s    
C( 
	"599>>9=*EJJ,G,GH(224 	H!"2H=Kyy""k&6&66

+,	
 Jr4   	depth_mapc                     | D ci c]7  }t        |t        j                  j                  j                        s2|||   9 }}t        |j                         d d      S c c}w )Nc                     | d   S r7  rD   r&  s    r2   rT   zsort_depths.<locals>.<lambda>&  s
    AaD r4   Tr8  )r   r   rB   r-   rC   rW   r.  )r   rJ  arg
arg_depthss       r2   sort_depthsrO  "  s[    '+ #z#uxx}}?Q?Q/RYs^J  *""$.$GGs
   3A A gmc                   
 t        j                         i 
| j                  j                  d      D ]  }j	                  |
fd      
|<    i t        | j                  j                        D ]
  \  }}||<    
fd}t        t        t        | j                  j                              }d}t        j                  }|D ]#  }|j                  D ]  }|   |k  s|   }|} % || S t        | j                  j                        |   d D ]
  } ||        t        j                   j                  |       }	|	S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traveral, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                     |    S r/   rD   r   s    r2   rT   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>E  s    A r4   c                 *   | g}t               }t        |      dkD  rH|j                         } | |v s| v r'|j                  |        || j                  z  }t        |      dkD  rHt        |fd      }|D ]  } j                  | fd      | <    y )Nr   c                     |    S r/   rD   )rQ   orders    r2   rT   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>X  s    %( r4   rU   c                     |    S r/   rD   r   s    r2   rT   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>Z  r   r4   )r   lenpopr   all_input_nodesrW   r   )r-   	cur_nodesinsertable_nodesr   r   rU  s      r2   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graphK  s    F	0:)nq ==?D''43;  & ---I )nq  ""28JK$ 	DD!++D2BCCI	Dr4   N)rB   r   r{   r   r   	enumerater|   rd   r   r   mathinfr   r   GraphModule)rP  r-   idxr\  r   first_node_in_bwdminimum_ordertangentr  new_gmr   r   rU  s             @@@r2   #reordering_to_mimic_autograd_enginerf  )  sL   . 
I"$C ##}#5 @''.>?D	@ Erxx~~. 	TdD$ &bhhnn=>NHHM! )MM 	)DT{]* %d$(!	))  	 RXX^^$U+<%=%?@ #T"# XX!!"i0FMr4   	fw_module	bw_modulefw_nodebw_nodedevice	rng_countlast_fwd_inputlast_bwd_inputc                    |j                   }|J | j                  }	|j                  }
t        j                  j                  j
                  }| j                  j                  |      5  | j                  j                  d|       }t        |      |j                  d<   |}ddd       |j                  j                  |      5  |j                  j                  d|       }t        |      |j                  d<   |}ddd       t        |j                        }|d<   | j                  j                  |      5  |	j                  d||j                  g|j                  |      }ddd       |j                         |	j!                  |       t        |j                        }|d<   |
j#                  |      5  |
j                  d||j                  g|j                  |      }|j                  |       |
j!                  |       ddd       ||fS # 1 sw Y   zxY w# 1 sw Y   +xY w# 1 sw Y   xY w# 1 sw Y   ||fS xY w)a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nfwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexr{   r   _prims	rng_primsgraphsafe_run_with_rng_stateinserting_afterr   r    rs   re   r   create_noder   r   replace_all_uses_with
erase_nodeinserting_before)rg  rh  ri  rj  rk  rl  rm  rn  
device_idxfw_graphbw_graphrw  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputs                     r2   %apply_graphsafe_rng_functionalizationr  s  s2   R J!!!HH#(<<#9#9#V#V  
	(	(	8 '!33nYK4PQ$?
$K5!&' 
	(	(	8 '!33nYK4PQ$?
$K5!&	' W^^$I*Ik		(	(	1 
%11(..07<<0	 2 

 !!"45  gnn%J+J{		"	"7	+ %))(..07<<0	 * 

 	%%j1G$% >))M' '' '
 
% >))s1   (9H9H&:.H3=AH?H#&H03H<?Inum_sym_nodesc                   ' t        j                         }d }dt        t        j                     fd'dt        t        j                     fd} ||       } ||      } ||      }	i }
| j
                  j                  D ]  }t        |      st        |j                  d      s&t        j                  j                  |j                  j                  v sW||j                     }||j                     }|	|j                     }||d|
|<    t        j                  j                  j                   }t        j                  j                  j"                  }d }|j
                  j%                  d	      D ]  }d
|j                  v s|} n |t'        d      g }t)        t+        |j
                  j%                  d	                  }t)        t+        |j
                  j%                  d	                  }t-        'fd|
j/                         D              }|j1                  t        j                  d             t3        |      dkD  }t        j4                  j6                  }t6        j8                  xr* | xr% |j:                   xs |j<                  j>                  }tA        |
jC                               D ]  \  }\  }}|d   }|d   } '|      }|j
                  }|j
                  }|r'|%|jD                  dk(  rtG        ||||||||      \  }}]|jI                  |      5  |jK                  d||j                  g|jL                  |jN                        }|jK                  dtP        jR                  |dfi       }|jK                  dtP        jR                  |dfi       } |jU                  |        |jW                  |       |jY                  |       d d d        |jI                  |      5  dt)        |       }!|j[                  |!      }" ||      |"j\                  d<   d d d        |jI                  |      5  |jK                  d|"|j                  g|jL                  |jN                        } |jU                  |        |jW                  |       d d d         |rt)        t_        |j
                  j%                  d	                  }#|#jL                  d   }$t3        |$      |z
  }%|$d |% ta        |      z   |$|%d  z   }&|j
                  jc                  |&       |j
                  jW                  |#       |je                          |je                          ||fS # 1 sw Y   xY w# 1 sw Y   LxY w# 1 sw Y   xY w)Nc                    i }| j                   j                  D ]i  }|j                  dk(  st        |j                  d      s*t
        j                  j                  |j                  j                  v s[|||j                  <   k |S )Nr   r   )
r{   r|   r   r   r   r   r   r   r   r   )gmodrandom_nodesr-   s      r2   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sl    JJ$$ 	/D?*DKK0II559I9II*.TYY'	/ r4   rL   c                     d| j                   vry| j                   d   }t        |t              s|f}|D ]D  }t        |t        j                        s|j
                  j                  dk(  s8|j
                  c S  t        j
                  d      S )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)rs   r   r   r   r%  rk  r/  )r-   
candidates	candidates      r2   
get_devicez)functionalize_rng_ops.<locals>.get_device  s     		!YYu%
*e,$J# 	,I)U\\2##((F2$+++	,
 ||E""r4   rk  c                     | -| j                   dk(  rt        j                  j                         S t        j                         S )Nr  )r/  r   r  get_rng_state)rk  s    r2   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s5    &++"7::++--""$$r4   r   )fwdbwdr   r   rd  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc              3   4   K   | ]  } |d            yw)r  NrD   )rP   	node_pairr  s     r2   rR   z(functionalize_rng_ops.<locals>.<genexpr>/  s       )2
9U#$r,  r  r   r  r  r  r   rs  r   rng_state_output_r   r   )3r   countr   r   rk  r{   r|   rx   r   r   r   r   r   r   ru  rv  run_and_save_rng_staterun_with_rng_stater   r   nextreversedr   valuesdiscardrW  	_inductorr   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomr]  r.  r/  r  r|  ry  r   r   r
  r  rz  r{  r   r   rs   iterr   r   	recompile)(r   rg  rh  r  uidr  r  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr-   	base_noderi  rj  run_and_save_rngr  bw_tangent_start_nodefw_rng_state_outputsrm  rn  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationrl  r  rk  r~  r  r  stater  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   r  s(                                          @r2   functionalize_rng_opsr    s   2 //
C	#HU\\2 #$%Xell%; % &l3"9-"9-!""(( 	S4 V,		11T[[5E5EE+DII6I&tyy1G&tyy1G:A'2R$Y/	S ||--DD//BB **m*< 		!$(! $o
 	
 (9??#=#=#=#OPQN(9??#=#=#=#OPQN 6N6U6U6W G OOELL'( W) ''J.. 	
""	
 *** R&&QQ , .7 &&(. D-)	)Iy E"E"G$???? 4"v%-R	.*NN **73 3%-%9%9#$!..87<<8">>	 &: &" !,,#$$,a0	 -  &11#$$*  2 
 --j9##G,$++E2136 **+@A M0c<
$,$8$8$D!0DV0L!&&u-M
 **73 	-%11#&+W^^KgllK">>	 2 
 --j9##G,	- 	-wD-R d9??#=#=#=#JKL#((+
 _}<**+()*+,-. 	
 	w'"">2iw3 36M M
	- 	-s&   &B5U&45U3AV &U0	3U=	 V
	c                    | j                   j                  D ]  }t        |      s|j                  D ]K  }t        |      s|j                  d   |j                  d   kD  s/t
        j                  |j                  d<   M |j                  j                  dd      st        d |j                  D              rt
        j                  |j                  d<    | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idrr   has_backward_hookFc              3   2   K   | ]  }t        |        y wr/   )rx   r  s     r2   rR   z)cleanup_recompute_tags.<locals>.<genexpr>  s      E)-t$Er  )	r{   r|   rx   r   rs   r   	MUST_SAVErt   r   )r   r-   r  s      r2   cleanup_recompute_tagsr    s     ""(( D$

 H"4(		-0499]3KK-=-G-GDIIk*H yy}}0%8 E15E B& *:)C)C		+&7D8 r4   	node_infomin_cut_optionsdont_banc                   %&'()*+,-./0 
t               t               /t        rQt        d | j                  D              }|t        d /j                  D              z
  }t
        j                  d|       d &d '&'/fd(	 dd l}(/fd	**/fd
}(fd)dt        f)/fd}	|j                         .t               %%./fd}
| j                  D ]A  }|j                  dk(  r|j                  v rm|j                  vr0.j                  |j                   dz   dt"        j$                         `.j                  |j                   dz   dt"        j$                         t'        |      r0.j                  |j                   dz   dt"        j$                         t)        |      st+        |      r |
|       j-                  |      r ||      r |
|       d|j.                  vxr d|j.                  vxs8 d|j.                  v xr( t1        |j.                  d   t2        j4                         }t7        |      rt        t9        |            }nF|r<t1        |j.                  j;                  d      t<              rdnt"        j$                  }n |	|      }.j                  |j                   dz   |j                   dz   |       |j>                  D ]>  }.j                  |j                   dz   |j                   dz   t"        j$                         @ D dt@        tB        jD                     dtF        dtF        f(fd}jH                  r(jJ                  D ]  }|j>                  D cg c]$  }j-                  |      rjM                  |      & }}|j>                  D cg c]  }j-                  |      s| }}tO        |      dkD  sw ||tQ        |            }tS        |j>                        D ]x  }j-                  |      sjM                  |      |kD  s* (||      s4|%v r9t
        j                  d|jM                  |      ||jM                  |              |
|       z  jT                  r^t               }| j                  D ]D  }j-                  |      sjM                  |      |fg}jM                  |      }tO        |      dkD  sJtW        jX                  |      \  }}||v r,|j[                  |       jM                  |      |dz   kD  rNtO        |      dk(  r@t
        j                  d||jM                  |      jM                  |              |
|       |j>                  D ]J  }j-                  |      s (||      s|%vs$tW        j\                  |jM                  |      |f       L tO        |      dkD  rG 	 |j_                  .dd      \  }}|\  }-t               }.fd |D        D ]   \  0}|jm                  -0fd!|D               " t               }|D ](  \  } }!| d d" |!d d# k(  sJ | d d" }"|j[                  |"       * to        |       +tq        | j                        D #ci c]  \  }#}||#
 c}}#,ts        +fd$|D        ,fd%&      }$|$%fS # t        $ r}t        d      |d }~ww xY wc c}w c c}w # t`        $ ri t
        j                  d       t
        j                  djc                  |jd                  jf                  ji                  .                   tk        .        w xY wc c}}#w )'Nc              3      K   | ]H  }|j                   d k(  r7t        |j                  d      r!t        |j                  j                         J yw)r   _overloadpacketN)r   r   r   r   r  r   s     r2   rR   z solve_min_cut.<locals>.<genexpr>  sA      &
ww/)gdkkCT.U ++,&
s   AAc              3   2   K   | ]  }t        |        y wr/   )r   rP   is     r2   rR   z solve_min_cut.<locals>.<genexpr>  s      4
CF4
r  z&Ops banned from re-materialization: %sc                 D   |j                   t        j                  j                  j                  k7  ry|j
                  d   }t        j                  j                  j                  |      \  }}|D ].  }|j                  |   }| |u r yt        |t              s)| |v s. y yNFr   T)r   r   r0  higher_orderauto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   rd   )ab
mutable_opmutable_arg_namesr+  r   rM  s          r2   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized  s    88uyy--AAAVVAY
 ##66GG
S	
% 	 D((4.CCx#t$8	  r4   c                     |j                   t        j                  j                  j                  k7  ry|j
                  d   }|D ]  }|j
                  d   |   }| |u s y y)NFtensors_to_cloner   T)r   r   r0  r   triton_kernel_wrapper_functionalr   )r  r  r  r   rM  s        r2   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional  sb    88uyy--NNNHH%78% 	D((8$T*CCx	 r4   c                 b   t        |      t        j                  k(  ry | |      ry | |      ry| j                  t        j
                  u r>| j                  d   j                  t        j                  j                  j                  u ryj                  |       xr j                  |      S )NTr   F)r#   r1  catr   r
  r  r   r   r0  r  r  r3   )r  r  r  r  op_typess     r2   r3   z!solve_min_cut.<locals>.is_fusible  s     1),Q29!Q?HH(((q	  yy%%FFG
 ""1%@(*=*=a*@@r4   r   zANeed networkx installed to perform smart recomputation heuristicsc                 <   j                  |       ryt        | g      }t        |      dkD  ro|j                         }|j                  D ]A  }j                  |      s ||      s yj                  |      s1|j                  |       C t        |      dkD  royr  )r:   r   rW  rX  r   r\   r   )r-   rZ  curr  r3   r  r  s       r2   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwards  s    D!v&	)nq --/C		 ( //5jd>S##D)MM$'	( )nq  r4   c                 T   | j                   dk7  ry| j                  t        j                  k(  ry| j                  j                  dd       t        j                  k(  ryt        j                  rj                  |       ry| j                  t        j                  j                  t        j                  j                  fv ryj                  rj!                  |       s$yj#                  |       sj%                  |       ryj&                  r3 |       r+t(        j+                  d| t-        | j.                               y| j0                  dk  r| j0                  t        j2                  kD  ryj4                  r/t7        d | j8                  D              }t;        |       }|dz  |k  S y)	Nr   Frr   Tzmaterialized backwards: %s %si  c              3   h   K   | ]*  }t        |t        j                        st        |       , y wr/   )r   rB   rC   r4  r  s     r2   rR   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>D  s&      % !*Q2H%s   22r   )r   r   r
  r  rs   rt   r   r  r   recompute_viewsr:   r1  lift_fresh_copyr3  
lift_freshro   r<   r8   r6   rn   r%   debugr   r   dist_from_bwmax_dist_from_bwrp   r-  r   r4  )r-   input_tensors_sizeoutput_sizer  r  r  s      r2   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputation  sc   77o%;;(***99==d+/?/I/II!!h&6&6t&<;;4//779P9PQQ22++D1!!$'8+H+H+N 77<U=
 II5tU4::=NO t#(9(9F<S<S(S ++!$ %%)YY% " #4.K?%777r4   c                 f      j                   dk(  ryt         fd j                  D               S )Nr   Tc              3   0   K   | ]  } |        y wr/   rD   )rP   r  r3   r-   s     r2   rR   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>O  s     E$z$-Es   )r   r  r   )r-   r3   s   `r2   is_materializedz&solve_min_cut.<locals>.is_materializedK  s*    77m#E$**EEEEr4   rL   c           
         t        |       }t        j                  r!j                  |       rt        j
                  S t        | j                  d   t              r-t        | j                  d   t        j                        st        S t        |dt        t        | j                  d      d      z  z        } |       r|S |dz  S )Nr   g?d   r      )r4  r   r  r:   r^  r_  r   rs   r   r   r   INT_INFrf   maxminr  )r-   mem_szr  r  s     r2   get_node_weightz&solve_min_cut.<locals>.get_node_weightQ  s    $!!h&6&6t&< 88Odii&5dii.= Vsc#d.?.?*Eq&IIJK4 MA:r4   c                 8   j                  |       ry| v ryt        |       ryd| j                  v r(t        | j                  d   t        j
                        ryj                  |        j                  d| j                  dz   t        j                         y)NFr   source_incapacityT)r:   rx   rs   r   r   r   r   add_edger   r^  r_  )r-   banned_nodesr  nx_graphr  s    r2   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowedm  s    D!8 $DII*TYYu-=u~~"N
 	(DII$5Ir4   r   r  sinkr  _outr   r          start_nodes	max_rangec                    g }| D ]*  }t        j                  |
j                  |      |df       , t        |      dkD  rt        j                  |      \  }}}|s
j                  |      S |j
                  D ]_  }
j                  |      s
j                  |      |kD  r*
j                  |      | 	||      f}||vsJt        j                  ||       a t        |      dkD  r|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushrc   rW  heappopr   r\   )r  r  sorted_nodesrQ   r+  r-   node_is_fusibler  r   r3   r  s            r2   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible  s    
 9; 	OANN<)*@*@*CQ)MN	O ,!#',}}\'B$At_" --d33

 
:++D1 --d3i? !..t4"4.6C
 ,.|S9
:	 ,!# r4   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]  }||   f  y wr/   rD   )rP   rQ   r  s     r2   rR   z solve_min_cut.<locals>.<genexpr>:  s     8Q$8s   c              3   0   K   | ]  }|v s|f  y wr/   rD   )rP   vnon_reachableus     r2   rR   z solve_min_cut.<locals>.<genexpr>;  s     Aa=.@q!fAs   		c              3   (   K   | ]	  }|     y wr/   rD   rP   r-   name_to_nodes     r2   rR   z solve_min_cut.<locals>.<genexpr>G  s     2d	2s   c                     |    S r/   rD   )r   node_idxs    r2   rT   zsolve_min_cut.<locals>.<lambda>G  s    (1+ r4   rU   ):r   get_default_op_listr$   r|   r,   r%   r;  networkxImportErrorr   floatDiGraphr   rI   rG   r  r   r^  r_  rx   r   r   r\   rs   r   r   r%  r   r   rt   r   r   rd   rB   rC   rf   rl   rY   rc   rW  r  r   rm   r  r	  r   r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphupdateget_name_to_noder]  rW   )1r   r  r  r  joint_module_opsops_ignorednxer  r  r   r-   is_non_tensor_nodeweightr  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr+  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namera  r   r  r  r  r3   r  r  r  r  r  r  r  r  s1    ```                                 @@@@@@@@@@@@r2   solve_min_cutr?    s    <"$H% &
#))&
 

 ' 4
$554
 *
 
 	9;G"A&0dF 2 zz|H(2L* !! 3X77h9...9+++!!$))e"3Vdhh!O dii&0&488L$
 dii%/$((Kd248(.
 ##D).Ft.L(. "E}DII'EUtyy SDIIe4Dell)S%S 	 t=./F!$))--"6FDHH  %T*F$))e+TYY-?&QJJ 	XDdii&0$))e2CdhhW	Xe3XL$rww- C C 4 ,,"44 	;I &OO++D1 &&t,F  "+I4L4LT4RH  6{Q&:8S[&Q#!)//2 ;D!006%2248;NN&y$7</$O%%229=/ %2248 5T:!;	;P 11'1|%++ !	VJ++J7''
3Z@2G $00<Kg,"w/3'>C  **3/+2CCG)HH."!..s3!..z: 15II VD!006&sD1 4w1G1G1Mt0TUV) g,"!	VF!~~h&I	9  )I}*4,F8i8 B4AdAAB ",I# !s|x},,,CRL	i !
 $K0L+4[5F5F+GHic4c	HH2	28ML %%G
  O
	|
R  @A2<<00BB8LMN)	& Is=   [ 7)[%0[*[*-[/ ]$	["[["/A2]!c                    dd l }dd l}|j                  j                  |       j	                         }|j                  |      d   }|j                         D ]c  }| |j                            |j                            d   }|j                  t        |             |t        d      k(  sS|j                  d       e t        j                  d       |j                  d       y )Nr   r  r_  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr%   r;  	write_svg)r  r*  rB  
dot_format	dot_graphedger-  s          r2   r%  r%  L  s    %%h/99;J))*5a8I##% "$//+,T-A-A-CDZPs6{#U5\!NN5!" HHAB,-r4   c                  4   g t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j"                  t         j$                  t         j&                  t         j(                  t         j*                  t         j,                  t         j.                  t         j0                  t         j2                  t         j4                  t         j6                  t         j8                  t         j:                  t         j<                  t         j>                  t         j@                  t         jB                  t         jD                  t         jF                  t         jH                  t         jJ                  t         jL                  t         jN                  t         jP                  t         jR                  t         jT                  t         jV                  t         jX                  t         jZ                  t         j\                  t         j^                  t         j`                  t         jb                  t         jd                  t         jf                  t         jh                  t         jj                  t         jl                  t         jn                  t         jp                  t         jr                  t         jt                  t         jv                  t         jx                  t         jz                  t         j|                  t         j~                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  } t         j                  t         j                  t         j                  g}|t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  gz  }|}| g t        j                  t        j                  t         j                  t         j                  t         j                  t        j                  t        j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t        j                  z  } | t         j                  t         j                  gz  } | |z  } | t               z  } | t         j                  gz  } | t        D cg c]  }t        |       c}z  } t        |       }t        t         j                  t         j                  t         j                  g      }t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j                  t         j                  t         j                  t         j                  g}||z  }t        |t        |      |t        |      |      S c c}w r/   )r1  r   subdivatan2mulr  r  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltabsbitwise_notceilfloorfracnegreluroundsilutruncr%   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrt
reciprocalsigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr-  mean_grad_sum_to_sizesum_to_sizeamaxtotype_asr
  r  squeeze	unsqueezersub_to_copyaliasviewslicetprimsbroadcast_in_dimexpand
as_stridedpermuteselectconvert_element_typeclone	full_likevarstd_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota)_low_memory_max_pool2d_offsets_to_indicesrt  gatherrI  
zeros_liker   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr'   )default_recomputable_opsrecomputable_view_opsr+   mr,   r*   r)   r(   s           r2   r  r  ]  sx   L0L0L0 	L0 	

	L0
 	L0 	L0 	L0 	L0 	L0 			L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 			1L02 	

3L04 			5L06 	7L08 			9L0: 	

;L0< 			=L0> 	

?L0@ 	AL0B 	

CL0D 	

EL0F 			GL0H 	IL0J 	KL0L 	

ML0N 	OL0P 			QL0R 	SL0T 			UL0V 			WL0X 	YL0Z 			[L0\ 			]L0^ 	_L0` 			aL0b 			cL0d 	

eL0f 			gL0h 	

iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	

uL0v 	

wL0x 			yL0z 	{L0| 			}L0~ 	L0@ 	AL0B 			CL0D 	EL0F 	GL0H 			IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 			UL0V 	WL0Z "\\4>>4::F		

	 	 %H $!		$!""$! 	

$! 		$!
 	$! 			$! 			$! 	$! 	$! 	$! 	$! 	$! 			$! 	$! 	

$!  	!$!" 	#$!$ 	%$!& 			'$!( 	)$!* 	+$!, 	-$!. 			/$!0 	1$!2 	

3$!4 	5$!6 			7$!8 	9$!: 	

;$!< 	

=$!> 	?$!@ 	A$!B 	C$!D 	

E$!F 	77G$! $L T[[ 99(/!   N1!3A!6 NN!":;T00$..$//RSJ!!

0044%%))   #Z/K()8 ' !Os   (dc                 J    i }| j                   D ]  }|||j                  <    |S r/   )r|   r   )r{   r  r-   s      r2   r'  r'    s.    L '"&TYY'r4   memoryruntimes
max_memoryall_recomputable_banned_nodesc                    t         j                  }|dk(  rt        |||      S |dk(  rt        |||      S |dk(  rt	        |||      S |dk(  rZt
        j                  d       t        j                  | |||      }t	        ||t        |      j                  t        |            S t        |      r ||| |||      \  }}	d	||	fS t        d
|       )Ngreedyilpdpdynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   r   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr   r   r   r%   warningr   inialize_from_graphr   get_knee_point_memory_budgetcallabler   )
r   r  r  r  r  r  SOLVERr  saved_node_idxrecomp_node_idxs
             r2   #_optimize_runtime_with_given_memoryr  	  s    33Fvx<<	5FHj99	468Z88	-	-?	

 0CC#*G-3-5	
 $7**)) + 	
 		
 
&	*0KY8U+
' ^_55I&RSSr4   no_dispatchr   r$  c                     t        | j                        }fd}|D cg c]
  } ||       }}| j                         D cg c]
  } ||       }}| j                  ||      S c c}w c c}w )Nc                     t        |       S )Nr#  )r   )dr$  s    r2   realize_symbolz8_remove_symbols_without_guarding.<locals>.realize_symbol>  s    H--r4   )stride)rd   shaper  new_empty_strided)r   r$  r  r  r   r  s    `    r2    _remove_symbols_without_guardingr  ;  sk    ME. )..1^A.E.)*4AnQ4F4uV44 /4s   A'A,c                 F   	 t         j                  }d }|dk(  ry|dk(  rat               5  ddlm} t        j                  | j                   j                  f      \  	|j                  	 fd      }|cd d d        S |dk(  rudd	l
m} t        j                  | j                   j                  f      \  	 |d
      5 }  j                  i 	 d d d        j                         }t        |d      S t        d|       # 1 sw Y   y xY w# 1 sw Y   ?xY w)Nc                 z   t        | t        j                        rAt        | j                  d   t        j
                        rt        | j                  d   d      S t        | t        j                        rAt        | j                  d   t        j                        rt        | j                  d   d      S t        | t        j                        r(t        | j                  d   t        j                        ryt        | t        j                        r(t        | j                  d   t        j                        ry| S )Nr   r"  r#        ?T)r   rB   rC   rs   r   r%  r  r   r   r   r   r&  s    r2   materialize_argz)estimate_runtime.<locals>.materialize_argI  s    a!j&M3AFF5MDQQ277#
166%=%,,(OAFF5MD99277#
166%=%..(Q277#
166%=%--(PHr4   testingr   profiler   )benchmarkerc                  (     j                    i S r/   )r   )r   r   r-   s   r2   rT   z"estimate_runtime.<locals>.<lambda>]  s    ;4;;3O3O r4   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsr  r   )
r-   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r2   estimate_runtimer  F  s   DDL
 y 		"] 	H!???TYY<TULD&**+OPB	 	 
	 <DKK8PQfU+ 	)tDKK((	),,.=!$$=l^LMM#	 		) 	)s   ADDDD c                 
    !"#$%&'() |dkD  s|dk  rt        d|       t        t        j                  t        j                  t        j
                  t        j                  t        j                        }t        j                  rt        |dddd      }|dk(  rj                  S t         |      \  }}|dk(  r|S dt        t        j                     dt        fd	   j                        '  |      %%'k  r|S %'fd
}dt        t        j                     f %'fd!t        |ddd      }t         |      \  }} !|      |k  r|S t        |d      t               \  }	}
 !|	      |k  r|	S ddlm" t%        "fdj                  D              $dt$        t        j                     dt        t        j                     f"$fd} ||
      }t'        |d       }t'        |t(        d      t+              dk(  rj                  S D cg c]  } |t)        |             c}&D cg c]  }t-        |       c})ddlm( &()fd#t        j2                  r!# )fd} |d       |d      g}|d   dd  |d   dd  k7  r|d   |d   fg}|r|j5                         \  }}|d   |d   z
  dk  r#|j7                  |       |j7                  |       F ||d   |d   z   dz        }|dd  |dd  k7  r|j7                  ||f       |dd  |dd  k7  r|j7                  ||f       |r|j9                          dd lm} |D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|j?                  d       |jA                  ||d !       tC        |      D ]"  \  }}|jE                  |d"|||   fd#d$d%&       $ |jG                  d'       |jI                  d(       |jK                  d)       |jM                  d       |jO                         }|jQ                          tS        jT                         }t        jV                  't        jV                  }tS        jX                  |d*       d+}tZ        j\                  j_                         r?tZ        j\                  ja                         r!d,tZ        j\                  jc                          }tR        jd                  jg                  |d-| d.ti                d/      }|jk                  |       tl        jo                  d0|        #| 1      d   S c c}w c c}w c c}w c c}w )2Nr   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rl   rm   rn   ro   rp   F)rl   rm   rn   ro   r   rL   c                 :    t        t        t        |             dz  S N    eA)r-  mapr4  )r   s    r2   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size  s    3x./#55r4   c                     | dz  z
  z  S r  rD   )szmax_act_sizemin_act_sizes    r2   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size  s    S\L899r4   activationsc                 &     |       z
  z
  z  S r/   rD   )r  r  r
  r  s    r2   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio  s"    )+6E<'
 	
r4   )rl   rm   rn   )ro   )get_node_storagec              3   .   K   | ]  } |        y wr/   rD   )rP   r-   r  s     r2   rR   z*choose_saved_values_set.<locals>.<genexpr>  s     T4 0 6Tr)  r  c                 r    | D cg c]&  }|j                   t        d      k  r |      vr|( c}S c c}w r  )r  rf   )r  r  r  input_storagess     r2   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodes  sD    
 "
 S)$Q'~= 
 	
 
s   +4c                     | j                   S r/   r   r&  s    r2   rT   z)choose_saved_values_set.<locals>.<lambda>  s    PQPVPV r4   rU   Tr8  r  c           
      d           5  t        |t        | d      |      \  }}}d d d        t               }D ]  }	 |j                  |           |j                        sJ t        ||
|      \  }}	t        rt        |||       |fS # 1 sw Y   pxY w# t        $ r Y rw xY w)Nr   )r   r  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)	r  r  r   r   BaseExceptionissubsetr?  r$   r   )memory_budgetr  r   r  r  r  r  ra  r   r+  aggressive_optionsr  r  r  r  s             r2   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsack  s    ] 	
 4%%M1%-	 &		 )3) 	C:3?@	   !>???'	
a !4'.K /'=!1&;&;%1	 ---Q	 	$ ! s   B B#B #	B/.B/c                 N     |       \  }}| t              |z
   |      fS )N)r  r   )r-  )r  r   r  r  r!  r   r  r  s      r2   estimate_for_budgetz4choose_saved_values_set.<locals>.estimate_for_budget  s@    -FYK.*L* )*-==l+ r4   r  r  gMbP?r  )
      )figsizeo)markerz.4fzoffset points)r   r$  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_ok _rank_memory_budget_paretor+  z.svgz%Generated Pareto frontier curve at %s)r  r  r   )8r   rk   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rG   r?  rd   rB   rC   r  torch._inductor.fx_utilsr  r   rW   r4  rW  r  torch.utils._mode_utilsr  visualize_memory_budget_paretorX  r   sortmatplotlib.pyplotpyplotfigureplotr]  annotatexlabelylabeltitlegridgcfshowosgetcwdmemory_budget_pareto_dirmakedirsr   distributedis_availableis_initializedget_rankpathr!  r   savefigr%   r  )*r   r  r  r  runtime_optimized_saved_valuesr+  r  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r  recomputable_banned_nodesr  r-   r#  optionsbisectslhsrhsmidpltitemx_valuesy_valuestxtfigfig_dirrank_suffixfig_namer   r  r  r  r  r!  r  r
  r  r  r  r  s*   ``                            @@@@@@@@@@@@r2   choose_saved_values_setrc  m  so   
 qMA-XYfXgh
 	
 $$AA#)#K#K%+%O%O & E E88O &&!"'',).$)
 (5)%"A --6RWW 6% 6 -Y-=-=>L,-KLL|#--:
4= 

 &##(%*	 '4Y 7'# ! 12]B++  % ;HY 2;7)< :;mK449T9CSCSTTN
 )
	bgg
 !>l K &'@FV W %+!x%! ()Q.2O-.HQK( ,I#' 4). ).V ,,	 	 's+-@-EF1:ab>WQZ^+
GAJ/0G";;=Sq6CF?T)NN3'NN3')3q6CF?a*?@qr7c!"g%NNC:.qr7c!"g%NNC:.  	'(/0DG00(/0DG00 	

7
#8C0  ) 	FAsLLs)hqk"*  	 	

?#

56		NOggi
))+**655GKK$/))+0A0A0P0P0R"5#4#4#=#=#?"@AK77<<+K=:L:N9OtT
 	H;XF %#yk	 	y^ 10s   !U U
U/Uc          	         | j                   j                          | j                          | j                   }t        j                  rt        |      }|| _         | j                   }t        |       }t        |       }|rt        |       } fd}	 |	|       }
t        |
j                        dk(  rt        | |      S t        | j                   j                        D ]  }|j                  dk(  rt        d      |_        #|
j#                  |      sd|_        <t        d      |_        |j$                  D ]*  }t'        |j                   |j                   dz         |_        ,  t        j(                  }|j                  D ]=  }t+        |j,                  j/                  dd      t0              s.|j,                  d   } n t3        ||
|	      }t5        t7        t8        |            }t5        t7        d
 |            }t;        | ||      \  }}|r|rt=        | ||t        |            \  }}t?        |      }t@        rtC        |D cg c]  }tE        |      tG        |      f c}      }tI        d |D              dz  }tJ        jM                  d|       tJ        jM                  d|       tO        d |j                   j                  D              }tO        d |j                   j                  D              }||z  }tQ        t              }|j                   j                  D ]R  }|jR                  |v stU        |jV                  d      s)|tG        |jV                  jX                        xx   dz  cc<   T tJ        jM                  dt        |      t        |      t        |             tC        |j[                         d d      }tJ        jM                  d|       ||fS c c}w )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                    t        | j                        t               | j                  j                  D ]m  }|j                  dk(  r d|j
                  v rj                  |       nt        |      rj                  |       |v sSj                  |j                         o t        t        t        | j                  j                              }t        t        t        | j                  j                              }||z   }t        |       \  }}j                  d |D               t        | j                  ||d      }t        fd|j                  D              t        fd| j                  j                  D              }d}	i }
| j                  j                  D ]  }|v s|	|
|<   |	d	z  }	 t!        |||
      S )
Nr   r   r   c              3   F   K   | ]  }||j                   dk7  s|  y w)Nr   r   )rP   r'  s     r2   rR   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>  s$      !
am8HA!
s   !!!r   c              3   Z   K   | ]"  }|j                   d k7  r|j                      $ ywr  r  r  s     r2   rR   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>  s.      <
ww(" #<
s   (+c              3   2   K   | ]  }|vr|vr|  y wr/   rD   )rP   r-   rI   rY   s     r2   rR   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>  s*      :
,,=N1N :
s   r   r   )r'  r{   r   r|   r   r   r   r   r&  r   rd   r   r   r   r   r   rF   )r   r-   r   r   rG   r   r   r  rJ   fw_cntrK   r  rI   rY   r   s              @@@r2   classify_nodesz;min_cut_rematerialization_partition.<locals>.classify_nodes  s   '(:(:;1; &&,, 	5Dww-'J$++,E!%%d+%d+!%%d+((!((4	5 VJ0B0B0H0HIJ!%&(:(:(@(@A"
 !77#;/$
 [ 	   !
"!
 	
 @Y
 2< <
*00<
 2

 0: :
$**00:
 0

  &&,, 	D((!'!	 %'8/8
 	
r4   r   r   r   r  r   r  N)r  c                     t        |        S r/   r  )rQ   s    r2   rT   z5min_cut_rematerialization_partition.<locals>.<lambda>  s    [^); r4   r  c              3   2   K   | ]  }t        |        y wr/   )r4  r  s     r2   rR   z6min_cut_rematerialization_partition.<locals>.<genexpr>  s     'J'Jr  z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc              3   T   K   | ]   }|j                   d k(  s|j                   " ywr   Nr  r   s     r2   rR   z6min_cut_rematerialization_partition.<locals>.<genexpr>  $      %
477o;UDII%
r  c              3   T   K   | ]   }|j                   d k(  s|j                   " ywrn  r  r   s     r2   rR   z6min_cut_rematerialization_partition.<locals>.<genexpr>  ro  r  r  z# remat/fw/bw: %d/%d/%dc                     | d   S r7  rD   r&  s    r2   rT   z5min_cut_rematerialization_partition.<locals>.<lambda>  s
    !A$ r4   Tr8  zCount of Ops Rematerialized: %s).r{   r   r  r   cser"   r~   r   r  rW  rI   r  r  r|   r   rf   r  r\   r   r  activation_memory_budgetr   rs   rt   r  rc  rd   r   r   r  r  rf  r$   rW   r4  r   r-  r%   r;  r   r   r   r   r   r  r.  )r   r  compilerr   ry   	cse_graphr   graph_has_recomputable_opsgraph_has_recomputable_rng_opsrj  r  r-   r  r  r   r   rg  rh  r  sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opss      `                      r2   r  r  Y  s   B **,D zz &	&$$K!5l!C%=l%K"!-l;,
\ |,I
 9&&'1, -
 	
 ++112 R77h #CD))$/ !D #CD

 R$'(9(94;L;Lq;P$Q!RR 33M!! diimmOT:EB IIo6M +#L 6+|<=O;\JKL 4''	Iy ")#8iC4H$ Iy 4I>IlKSV4KL %('J\'J$JS$P!:<UV 	?N$ %
"+//"7"7%
 
 % %
"+//"7"7%
 
 &7!,S!1OO)) 	>DyyK'GDKKAR,Ss4;;6678A=8	> 	%  		
 $FLLNPTU24FGi9 Ls   O tracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rWt        j                  | j                        }t        j                  | |      } | j                  j
                  D ]	  }i |_         t        j                  j                  |      \  }	}
|
sdt        j                  z   }
t        j                  d|	|
       t        j                  | |||      }|j!                         }t#        |d|
j%                  d      z         }|	 |
 }|	 ||       y  |||       y )N.zWriting FX graph to file: %s%s)r  r  write_)r  )copydeepcopyr{   rB   r`  r|   rs   rF  rN  splitextr   torch_compile_graph_formatr%   r;  r   FxGraphDrawerget_main_dot_graphr@  lstrip)r  r  r  r  r  r  r  r   r-   baseextgr   write_methods                 r2   
draw_graphr  	  s     MM&,,/		2LL&& 	DDI	  'ID#F555HH-tS9""+'		A 	
A1hC89LfSENE|UU&r4   r/   )r   )inductor)fx_graphTNFN)r  rg   r  r   loggingr^  r
  rF  os.pathr:  r   dataclassesr   r   typingr   r   r	   r
   r   torch._inductor.inductor_primstorch.distributedtorch.fxrB   torch.utils._pytreeutils_pytreer   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   torch.fx.passesr   torch.utils._ordered_setr   torch.utils.checkpointr   r.  r   -_activation_checkpointing.graph_info_providerr   "_activation_checkpointing.knapsackr   r   r   ,_activation_checkpointing.knapsack_evaluatorr   _aot_autograd.logging_utilsr   _aot_autograd.utilsr    r!   compile_utilsr"   r#   sympydebug_partitionerr$   ri   rA   	getLoggerr=   r%   Loggerr0  r1  r  r'   rF   rk   rC   rx   r`  r~   r   rf   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r4  r=  	lru_cacherI  re   rO  rf  rk  r  r  r  r?  r%  r  r'  r  r  r8  r  r%  r  r  rc  r  r  rD   r4   r2   <module>r     sJ          	  # * ; ;  %   $ $ @ H L  ) / 3  L 
 L ; M 8  %66 t 6'g''1W^^ 1yy~~		 > > >2      :    T r~~ $ 2>> d  C  
  #	DDMD "'']D sm	D
 XXDNRWW  Gbgg G$ Gbgg $ bgg $ XRWW X XCrww C4 CKrww K4 Krww 4 $..$
4=$rww-'($$rww- s `"..`"rww-`" "'']`"
 `" 2>>2>>)*`"FS..S
2>>2>>)*Sl c("# " "277 s :Jbhh J T "Hbggsl!3 HU277C<=P8Q HGBNN Gr~~ GTZ*xx##Z*xx##Z* XX]]Z* XX]]	Z*
 LLZ* Z* HHMMZ* HHMMZ*zH ..H ~~H  ~~H  	H 
 2>>2>>)*H V# #BNN #T /3	H&H&H& #H& z"''*+	H&V."bW bJBHH +T+TK+T 5k+T 	+T
 +T $(=+T 5$s)T#Y&'+T\ 05 5 5 5$NT i	i	i	 
"'']	i	^ m ..m  2>>2>>)*m f ,0#%)'HH  '' ' 	'
 5d3i(
)' ' c]' 
'r4   