
import collections
import logging
from collections.abc import Iterator
from typing import Any, Optional, Union

import torch
from torch.autograd.graph import GradientEdge, Node
from torch.nn import Parameter

from ._debug import map_debug_info


logger = logging.getLogger(__name__)


def _get_grad_fn_or_grad_acc(t: torch.Tensor) -> Union[Node, None]:
    """
    Get the grad function or grad accumulator for a tensor.

    Accumulate grad nodes are lazily created, so we need to use a
    dummy view in order to trigger its creation.
    """
    if t.requires_grad and t.grad_fn is None:
        # Leaf tensor: materialize its AccumulateGrad node via a no-op view.
        viewed_t = t.view_as(t)
        grad_fn = viewed_t.grad_fn
        if grad_fn is not None:
            return grad_fn.next_functions[0][0]
        else:
            raise RuntimeError(
                "Attempted to get grad_fn, but got None. "
                "Is this being created in a no-grad context?"
            )
    else:
        return t.grad_fn


def reverse_closure(
    roots: list[Node],
    target_nodes: set[Node],
    reverse_edges_dict: dict[Node, list[Node]],
) -> tuple[set[Node], set[Node]]:
    """
    This function returns the reverse closure of the given roots,
    i.e. the set of nodes that can be reached from the roots by following the
    reverse edges of the graph. The target_nodes are the nodes that we want to
    include in the closure.
    """
    closure: set[Node] = set()
    visited_target_nodes: set[Node] = set()
    q: collections.deque = collections.deque()
    for node in roots:
        if node is not None and node not in closure:
            closure.add(node)
            q.append(node)
    while q:
        node = q.popleft()
        reverse_edges = reverse_edges_dict[node]
        for fn in reverse_edges:
            if fn in closure or fn is None:
                continue
            if fn in target_nodes:
                # Stop at target nodes, but remember that we reached them.
                visited_target_nodes.add(fn)
                continue
            closure.add(fn)
            q.append(fn)
    return closure, visited_target_nodes


def construct_reverse_graph(roots: list[Node]) -> dict[Node, list[Node]]:
    q: collections.deque = collections.deque()
    root_seen: set[Node] = set()
    reverse_edges_dict: dict[Node, list[Node]] = collections.defaultdict(list)
    for node in roots:
        if node is not None and node not in root_seen:
            q.append(node)
            root_seen.add(node)
    while q:
        node = q.popleft()
        for fn, _ in node.next_functions:
            if fn is not None:
                # Queue each producer node the first time it is discovered.
                if len(reverse_edges_dict[fn]) == 0:
                    q.append(fn)
                reverse_edges_dict[fn].append(node)
    return reverse_edges_dict
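
# Illustrative sketch, not part of the original module: how the helpers above fit
# together on a tiny autograd graph. `_demo_reverse_graph` is a hypothetical name
# used only for this example.
def _demo_reverse_graph() -> None:
    x = torch.randn(3, requires_grad=True)
    w = torch.randn(3, requires_grad=True)
    out = (x * w).sum()
    # Non-leaf tensors expose their grad_fn directly; leaves need the dummy-view
    # trick to surface their AccumulateGrad node.
    out_fn = _get_grad_fn_or_grad_acc(out)  # SumBackward0
    x_acc = _get_grad_fn_or_grad_acc(x)
    assert type(x_acc).__name__ == "AccumulateGrad"
    # Reverse edges point from producers to consumers, so walking them from the
    # input's accumulator reaches the output node.
    reverse_edges = construct_reverse_graph([out_fn])
    closure, _ = reverse_closure([x_acc], set(), reverse_edges)
    assert out_fn in closure
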
def get_param_groups(
    inputs: list[Node],
    params: list[Node],
    reverse_edges_dict: dict[Node, list[Node]],
) -> list[dict[str, Any]]:
    """
    Given a list of inputs and a list of parameters, return a list of parameter
    groups, where each group contains the parameters and the intermediates that
    are connected to the parameters.

    The returned list of parameter groups is a list of dictionaries, where each
    dictionary contains the following keys:
    - "params": a set of parameters
    - "intermediates": a set of intermediates
    """
    # Reverse closure from the stage inputs towards the outputs, excluding weights.
    inputs_closure, _ = reverse_closure(inputs, set(), reverse_edges_dict)
    param_groups: dict[Node, dict[str, set]] = dict()  # keyed on intermediates
    for param in params:
        closure, intersected = reverse_closure(
            [param], inputs_closure, reverse_edges_dict
        )
        param_group: dict[str, set] = {
            "params": {param},
            "intermediates": intersected,
        }
        for input_node in intersected:
            existing = param_groups.get(input_node, None)
            if existing is not None:
                existing["params"] = existing["params"].union(param_group["params"])
                existing["intermediates"] = existing["intermediates"].union(
                    param_group["intermediates"]
                )
                param_group = existing
            else:
                param_groups[input_node] = param_group

    # Deduplicate groups that were merged through shared intermediates.
    union_params: set[Node] = set()
    seen_ids: set[int] = set()
    unique_param_groups = []
    for param_group in param_groups.values():
        if id(param_group) not in seen_ids:
            seen_ids.add(id(param_group))
            unique_param_groups.append(param_group)
            union_params = union_params.union(param_group["params"])

    return unique_param_groups
    - "intermediates": a set of intermediates

    The returned list of parameter groups is a list of dictionaries,
    )r1   intermediatesNr1   r3   )	r(   r   dictgetunionvaluesidr   r   )r0   r1   r!   inputs_closurer.   param_groupsparamr"   intersectedparam_group
input_nodeexistingunion_paramsseen_idsunique_param_groupss                  r   get_param_groupsrC   Y   sL   " (7IJNA/3vL 7.G^%7 
 g('
 & 		7J#''
D9H#%-h%7%=%=k(>S%T",4_,E,K,K0-) '+6Z(		77( "eLH#**, Ek?(*LLK)&&{3'--k(.CDL	E r   stage_outputs_or_lossoutput_gradsinput_valuesweights.c           	         t        t        dt        t        |                   }t        t        dt        t        |                  }t        t        dt        t        |                  }t	        |      }t        |||      }g }	|D ]D  }
t        |
d         D ]1  \  }}d }|j                   ||
|            }|	j                  |       3 F |"| D cg c]  }t        j                  |       }}t        j                  j                  | ||d      }t        |      D ]4  \  }}|j                  ||   |_        |xj                  ||   z  c_        6 | D ]  }|j                           |	D ]  }|j                           ||fS c c}w )aj  
    Compute the gradients for only the stage inputs with
    respect to the stage outputs (if non-last stage) or loss (if last stage)

    After computing input gradients, we save the intermediate nodes in `param_groups`
    for later use in stage_backward_weight. We don't need to save any other intermediate nodes
    that aren't needed for dW because when we do dW calculation, we start from saved intermediates.
    Detaching the stage_outputs_or_loss at the end of this function is important as
def stage_backward_weight(
    weights: Iterator[Parameter],
    param_groups: list[dict[str, Any]],
    retain_graph=False,
) -> tuple[Optional[torch.Tensor], ...]:
    # Map each weight's grad accumulator node back to the weight and its position.
    grad_acc_to_weight = {}
    weight_grads: list[Optional[torch.Tensor]] = []
    for index, weight in enumerate(weights):
        grad_acc = _get_grad_fn_or_grad_acc(weight)
        grad_acc_to_weight[grad_acc] = weight, index
        weight_grads.append(weight.grad)

    for param_group in param_groups:
        # Gradient edges of the saved intermediates (the dW starting points) and
        # of the weights' accumulator nodes.
        intermediate_edges = tuple(
            GradientEdge(i, 0) for i in param_group["intermediates"]
        )
        weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"])
        # The intermediate nodes are no longer needed once the gradient edges
        # exist; dropping them releases the references held since
        # stage_backward_input (including those captured by the prehooks).
        del param_group["intermediates"]

        assert all(len(g) == 1 for g in param_group["grads"])
        # Seed autograd.grad with the gradients captured by the prehooks during
        # stage_backward_input, starting from the intermediate edges.
        dweights = torch.autograd.grad(
            intermediate_edges,
            weights_edges,
            grad_outputs=sum(param_group["grads"], tuple()),
            retain_graph=retain_graph,
        )
        # Release the captured gradients as soon as they have been consumed.
        del param_group["grads"]

        for grad_acc, dw in zip(param_group["params"], dweights):
            weight, index = grad_acc_to_weight[grad_acc]
            if weight.grad is None:
                weight.grad = dw
            else:
                weight.grad += dw
    # Return the gradients in the original order the weights were provided in.
    return tuple(weight_grads)
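
# Illustrative sketch, not part of the original module: the split dX/dW backward on a
# single linear layer, mimicking how a pipeline stage defers its weight gradients.
# `_demo_split_backward` is a hypothetical name used only for this example.
def _demo_split_backward() -> None:
    lin = torch.nn.Linear(4, 4)
    inp = torch.randn(2, 4, requires_grad=True)
    loss = lin(inp).sum()

    # Phase 1: input gradients only; parameter gradients are deliberately deferred.
    dinputs, param_groups = stage_backward_input([loss], None, [inp], lin.parameters())
    assert dinputs[0] is not None and inp.grad is not None
    assert all(p.grad is None for p in lin.parameters())

    # Phase 2: weight gradients, replayed from the intermediates saved in phase 1.
    stage_backward_weight(lin.parameters(), param_groups)
    assert all(p.grad is not None for p in lin.parameters())
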
def stage_backward(
    stage_output,
    output_grads,
    input_values,
    outputs_with_grads_idxs: Optional[list[int]] = None,
) -> tuple[Optional[torch.Tensor], ...]:
    """
    This is a helper function to:
    1. compute the gradients for the stage inputs, and
    2. accumulate gradients for the stage module's parameters.

    Given the input value(s) and the corresponding gradient for the output
    value(s), compute and accumulate gradients for all parameter values (leaves
    in the autograd trace) as well as return a list of the gradients for the
    input values.
    """
    if outputs_with_grads_idxs is not None:
        stage_output = [stage_output[i] for i in outputs_with_grads_idxs]
        output_grads = [output_grads[i] for i in outputs_with_grads_idxs]

    try:
        # stage_output may be a composite datatype like dict; extract all
        # individual tensor values here.
        stage_output_tensors: list[torch.Tensor] = []
        output_grad_tensors: list[Optional[torch.Tensor]] = []

        def extract_tensors_with_grads(
            output_val,
            grad_val,
            # The function is passed to itself explicitly (rather than captured
            # by closure) to avoid a reference cycle that would keep the
            # extracted tensors alive until the garbage collector runs.
            extract_tensors_with_grads,
        ):
            if isinstance(output_val, torch.Tensor):
                if not output_val.requires_grad and output_val.grad_fn is None:
                    return
                assert isinstance(
                    grad_val, (torch.Tensor, type(None))
                ), f"Expected Tensor or None gradient but got {type(grad_val)}"
                stage_output_tensors.append(output_val)
                output_grad_tensors.append(grad_val)
            elif isinstance(output_val, (tuple, list)):
                if grad_val is None:
                    return
                assert isinstance(
                    grad_val, (tuple, list)
                ), f"grad_value expected to have type {type(output_val)} but got {type(grad_val)}"
                assert len(output_val) == len(grad_val)
                for ov, gv in zip(output_val, grad_val):
                    extract_tensors_with_grads(ov, gv, extract_tensors_with_grads)
            elif isinstance(output_val, dict):
                if grad_val is None:
                    return
                assert isinstance(grad_val, dict)
                assert set(output_val.keys()) == set(grad_val.keys())
                for k in output_val.keys():
                    extract_tensors_with_grads(
                        output_val[k], grad_val[k], extract_tensors_with_grads
                    )
            else:
                # Output is a non-tensor type; just ignore it.
                pass

        extract_tensors_with_grads(
            stage_output, output_grads, extract_tensors_with_grads
        )

        torch.autograd.backward(
            stage_output_tensors, grad_tensors=output_grad_tensors
        )

        # Extract gradients wrt the input values.
        grad_inputs: list[Optional[torch.Tensor]] = []
        for val in input_values:
            if isinstance(val, torch.Tensor):
                grad_inputs.append(val.grad)
            else:
                grad_inputs.append(None)

    except Exception as e:
        exc_msg = f"""
        Failed to run stage backward:
        Stage output: {map_debug_info(stage_output)}
        Output gradient: {map_debug_info(output_grads)}
        Input: {map_debug_info(input_values)}
        """
        raise RuntimeError(exc_msg) from e

    return tuple(grad_inputs)
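
# Illustrative sketch, not part of the original module: the fused per-stage backward,
# used when input and weight gradients are not split into separate phases.
# `_demo_stage_backward` is a hypothetical name used only for this example.
def _demo_stage_backward() -> None:
    lin = torch.nn.Linear(4, 4)
    inp = torch.randn(2, 4, requires_grad=True)
    out = lin(inp)
    grad_inputs = stage_backward(out, torch.ones_like(out), [inp])
    # One gradient per stage input; parameter grads were accumulated in place.
    assert grad_inputs[0] is not None
    assert lin.weight.grad is not None and lin.bias.grad is not None
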
def _null_coalesce_accumulate(lhs, rhs):
    """
    Coalesce two values, even if one of them is null, returning the non-null
    value.
    """
    if lhs is None:
        return rhs
    elif rhs is None:
        return lhs
    else:
        return torch.add(lhs, rhs)
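
# Illustrative sketch, not part of the original module: _null_coalesce_accumulate keeps
# the non-None operand and otherwise adds, i.e. plain gradient-accumulation semantics.
# `_demo_null_coalesce_accumulate` is a hypothetical name used only for this example.
def _demo_null_coalesce_accumulate() -> None:
    g = torch.ones(2)
    assert _null_coalesce_accumulate(None, g) is g
    assert torch.equal(_null_coalesce_accumulate(g, g), 2 * g)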