
    Vh\                        d dl mZ d dlmZmZ d dlmZmZ d dlm	Z	m
Z
mZmZmZmZ d dlmZmZmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' dZ(dgZ) ed      Z* ed      Z+ ed      Z,ejZ                  j\                  Z. G d de      Z/ G d de      Z0 G d de      Z1 G d d      Z2 G d de      Z3 G d de      Z4y)    )deepcopy)autoEnum)partialwraps)AnyCallable
NamedTupleOptionalTypeVarUnion)	ParamSpecTypeVarTupleUnpackN)nnoptim)active_fake_mode)_RefType_State
MemTracker)
FSDPModule)FSDPParamGroup)TorchDispatchMode)tree_map_only)WeakIdKeyDictionaryweakrefTotalFSDPMemTracker_P_R_Tsc                   <    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZy)_FSDPRefTypea  
    Enumerates categories of memory usage in FSDP modules, including parameters, gradients, activations,
    and optimizer states.

    Attributes:
        SHARDED_PARAM (str): Memory usage of sharded parameters.
        UNSHARDED_PARAM (str): Memory usage of unsharded parameters.
        SHARDED_GRAD (str): Memory usage of sharded gradients corresponding to the sharded parameters.
        UNSHARDED_GRAD (str): Memory usage of unsharded gradients corresponding to the unsharded parameters.
        ACT (str): Memory usage of activations and tensors from forward and AC recomputation.
        TEMP (str): Memory usage of temporary tensors during the backward pass including gradients of activations.
        ALL_GATHER (str): Memory usage of all_gather output tensor.
        REDUCE_SCATTER (str): Memory usage of reduce_scatter input tensor.
        OPT (str): Memory usage of tensors storing optimizer states.
        INP (str): Memory usage of input tensors.
    zSharded ParamzUnsharded ParamBufferzSharded GradzUnsharded Grad
ActivationTempz
All GatherzReduce ScatterOptStateInputsN)__name__
__module____qualname____doc__SHARDED_PARAMUNSHARDED_PARAMBUFFERSHARDED_GRADUNSHARDED_GRADACTTEMP
ALL_GATHERREDUCE_SCATTEROPTINP     Z/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/_tools/fsdp2_mem_tracker.pyr#   r#      s@    " $M'OF!L%N
CDJ%N
C
Cr9   r#   c                   "    e Zd ZU eed<   eed<   y)_SavedFSDPMethodspre_backwardpost_backwardN)r)   r*   r+   r	   __annotations__r8   r9   r:   r<   r<   =   s    r9   r<   c                   @    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZy)_FSDPModStatezW
    Enumerates the states of FSDP modules during the forward and backward passes.
    zBefore Pre-ForwardzAfter Pre-ForwardzBefore Post-ForwardzAfter Post-ForwardzBefore Pre-BackwardzAfter Pre-BackwardzBefore Post-BackwardzAfter Post-BackwardzPre-Forward ACzPost-Forward ACzPeak ForwardzPeak BackwardN)r)   r*   r+   r,   
BEF_PRE_FW
AFT_PRE_FWBEF_POST_FWAFT_POST_FW
BEF_PRE_BW
AFT_PRE_BWBEF_POST_BWAFT_POST_BW	PRE_FW_AC
POST_FW_ACPEAK_FWPEAK_BWr8   r9   r:   rA   rA   B   sE     &J$J'K&K&J%J(K'K I"JGGr9   rA   c                        e Zd ZdZdeddfdZy)_FSDPModMemStatsa  
    A class to store the memory statistics of an FSDP module.

    Args:
        mod_fqn (str): The fully qualified name of the FSDP module.

    Attributes:
        snapshots (Dict[_FSDPModState, Dict[torch.device, Dict[str, int]]]): A dictionary of memory snapshots
        of the module at different states as defined by ``_FSDPModState``. Each key is a device, and
        each value is another dictionary with keys as memory reference types defined by ``_FSDPRefType`` and
        values as the memory consumed in bytes.

    mod_fqnreturnNc                 .    || _         i | _        i | _        y N)rP   
local_peak	snapshots)selfrP   s     r:   __init__z_FSDPModMemStats.__init__d   s    35  	r9   )r)   r*   r+   r,   strrW   r8   r9   r:   rO   rO   U   s      r9   rO   c                   `    e Zd Z e       Z e       Z e       Z e       Z e       Z e       Z	y)
_FSDPStateN)
r)   r*   r+   r   PRE_FWFWPOST_FWPRE_BWBWPOST_BWr8   r9   r:   rZ   rZ   l   s,    VF	BfGVF	BfGr9   rZ   c                   ,    e Zd ZdZ	 ddej
                  j                  deej                  j                     ddf fdZ
deddfdZd	ed
eeeeee      eeef   f   f   deeeeee      eeef   f   f   fdZd	edeeef   deeef   fdZd	edeeef   deedf   fdZd	edeeef   deedf   fdZd dZd dZd dZd dZdeedf   ddfdZde ej                  ej                  ejB                  f   ddfdZ"d!dZ#deddfdZ$d"dZ% xZ&S )#r   a  
    A ``TorchDispatchMode`` based context manager that extends ``torch.distributed._tools.mem_tracker.MemTracker`` to track
    and categorize the peak memory and module-wise memory usage of FSDP modules.

    It tracks the peak memory usage across all the devices of all the FSDP modules in the module tree and categorizes
    the tensor memory usage as defined by ``_FSDPRefType``. Further, it captures memory `snapshots` at different stages of
    the module execution defined by ``_FSDPModState``.

    Attributes:
        memory_tracking: A weakref key dictionary to store the memory statistics of each module. Each key is a reference
        to a module, and each value is a ``_FSDPModMemStats`` object that stores the memory statistics of the module.

    Args:
        mod (torch.nn.Module): The root FSDP module to be tracked.
        optm (torch.optim.Optimizer, optional): The optimizer to be tracked.

    Note: Please refer to ``torch.distributed._tools.mem_tracker.MemTracker`` to learn about the limitations.

    Example usage

    .. code-block:: python

        module = ...
        optimizer = ...
        inp = ...
        fmt = FSDPMemTracker(module, optimizer)
        fmt.track_inputs((inp,))
        with fmt:
            optimizer.zero_grad()
            loss = module(inp)
            print("After Forward:")
            fmt.display_snapshot("current")
            loss.backward()
            optimizer.step()
        fmt.display_snapshot("peak")
        fmt.display_modulewise_snapshots(depth=3, units="MB")

    NmodoptmrQ   c                     t         |           t        |t              sJ d       || _        || _        t               | _        t        j                  | _
        t        | _        y )Nz)FSDPMemTracker only supports FSDP modules)superrW   
isinstancer   	_root_mod_optmr   _fsdp_mod_to_saved_methodsrZ   r[   _fsdp_stater#   
_ref_class)rV   rb   rc   	__class__s      r:   rW   zFSDPMemTracker.__init__   sT    
 	#z*W,WW*
?R?T''1'8'8*6r9   fsdp_param_groupc                     |j                   D ]e  }| j                  |j                  t        j                         |j                  j
                  }|F| j                  |t        j                         g y rS   )fsdp_params_update_and_maybe_create_winfossharded_paramr#   r-   gradr0   )rV   rm   
fsdp_paramsharded_grads       r:   %_instrument_fsdp_sharded_params_gradsz4FSDPMemTracker._instrument_fsdp_sharded_params_grads   sl     +66 
	J00((** &3388L'44  --
	r9   fsdp_modorig_fsdp_state_pre_fwc                      t              dt        j                  dt        j                  dt        t        t
        t              t        t        t        f   f   f fd       }|S )NargskwargsrQ   c                  ,   t         j                  _        j                  j	                        }|J j
                  vrt        |      }|j
                  <   j                         }|j                         D ci c]  \  }}||t            c}}|_
        |j                  j                  t        j                  g       j                  |       |j                  j                  t        j                   g       j                  t#        |             nMj                  j$                  s7j                  j&                  |hz
  }t)        |      dk(  rd|v rt+        d       | i |\  } }j-                         }|j.                  x}	r;|	j0                  D ],  }
j3                  |
j4                  t6        j8                         . j
                     }j                  j$                  r>t        j:                  }j<                  2t?        j@                        _        d_!        nt        jD                  }|j                  j                  |g       j                  j                                t         jF                  _        | |fS c c}}w )N   GlobalzFSDPMemTracker does not support memory tracking for multiple iterative calls. Either use ``reset_mod_stats`` to clear module memory stats for the previous iteration or file a github issue if you need this feature.T)$rZ   r[   rj   _mod_trackerget_known_fqnmemory_trackingrO   get_tracker_snapshotitems
_TOTAL_KEYrT   rU   
setdefaultrA   rL   appendrB   r   is_bwparentslenNotImplementedError_get_fsdp_state_fsdp_param_groupro   rp   unsharded_paramr#   r.   rJ   _ac_modr   ref_in_acrC   r\   )ry   rz   rP   mod_statsnapshotdevdev_snapr   
fsdp_staterm   rs   staterv   rw   rV   s               r:   innerz5FSDPMemTracker._fsdp_state_pre_forward.<locals>.inner   sL     *00D''55h?G&&&t333+G419$$X.446CK>>CS'2?#xC*--'# ""--m.C.CRHOO ""--m.F.FKRRX& &&,,++33wi?w<1$W)<-L  24B6BLD&!113J#-#?#???"2">"> J88"22$44
 ++H5H  &&%//<<'#*;;x#8DL"&DK%00))%4;;D<U<U<WX)}}D<K's   J)
r   r   ry   rz   tupler   r!   dictrX   r   )rV   rv   rw   r   s   ``` r:   _fsdp_state_pre_forwardz&FSDPMemTracker._fsdp_state_pre_forward   s`    $ 
%	&/	 77/	 &(ii/	 5%tCH~56/	  
'/	 b r9   orig_fsdp_state_post_fwc                      t              dt        j                  dt        j                  dt        f fd       }|S )Nry   rz   rQ   c                  R   j                      }j                  j                  r=t        j                  }j
                  1j                         u rd _        d_        nt        j                  }|j                  j                  |g       j                  j                                t        j                  _         | i |}j                  j                  sG|j                  j                  t        j                  g       j                  j                                |S NF)r   r~   r   rA   rK   r   r   rD   rU   r   r   r   rZ   r]   rj   rE   )ry   rz   r   r   outputrv   r   rV   s        r:   r   z6FSDPMemTracker._fsdp_state_post_forward.<locals>.inner
  s    ++H5H  &&%00<<+(0J#'DL"'DK%11))%4;;D<U<U<WX)11D,d=f=F$$**""--m.G.GLSS--/ Mr9   )r   r   ry   rz   r    )rV   rv   r   r   s   ``` r:   _fsdp_state_post_forwardz'FSDPMemTracker._fsdp_state_post_forward   sA     
&	'	 	BII 	" 	 
(	( r9   "orig_fsdp_param_group_pre_backwardc                 |     t              dt        j                  dt        j                  dd f fd       }|S )Nry   rz   rQ   c                     t         j                  _        j                     }j	                         }|j                         D ci c]  \  }}||t            c}}|_        |j                  j                  t        j                  g       j                  |       |j                  j                  t        j                  g       j                  t        |              | i | |j                  j                  t        j                  g       j                  j	                                t         j                   _        y c c}}w rS   )rZ   r^   rj   r   r   r   r   rT   rU   r   rA   rM   r   rF   r   rG   r_   )	ry   rz   r   r   r   r   rv   r   rV   s	         r:   r   z<FSDPMemTracker._fsdp_param_group_pre_backward.<locals>.inner(  s	   )00D++H5H002H?G~~?O#.;c8Xj))#H ))-*?*?DKKHU))-*B*BBGNN" /??))-*B*BBGNN))+  *}}D#s   	Er   r   ry   rz   )rV   rv   r   r   s   ``` r:   _fsdp_param_group_pre_backwardz-FSDPMemTracker._fsdp_param_group_pre_backward!  sA     
1	2	- 	-BII 	-$ 	- 
3	-$ r9   #orig_fsdp_param_group_post_backwardc                 |     t              dt        j                  dt        j                  dd f fd       }|S )Nry   rz   rQ   c                     j                         }|j                  x}rL|j                  D ]=  }|j                  j                  }|
j                  |t        j                  d       ? 
j                     }|j                  j                  t        j                  g       j                  
j                                t        j                   
_         	| i | |j                  x}rJ|j                  D ];  }|j$                  j                  }|
j                  |t        j&                         = |j                  j                  t        j(                  g       j                  
j                                y )NTupdate_existing)r   r   ro   _unsharded_paramrr   rp   r#   r1   r   rU   r   rA   rH   r   r   rZ   r`   rj   rq   r0   rI   )ry   rz   r   rm   rs   unsharded_gradr   rt   rv   r   rV   s           r:   r   z=FSDPMemTracker._fsdp_param_group_post_backward.<locals>.innerE  s`   !113J#-#?#???"2">"> J%/%@%@%E%EN%1<<*(77,0 =  ++H5H))-*C*CRHOO))+  *11D/@@#-#?#???"2">"> J#-#;#;#@#@L#/<<((55 ))-*C*CRHOO))+r9   r   )rV   rv   r   r   s   ``` r:   _fsdp_param_group_post_backwardz.FSDPMemTracker._fsdp_param_group_post_backward=  sB     
2	3	 	BII 	$ 	 
4	@ r9   c                 X   | j                   j                         D ]M  }t        |t              s|j	                         }|j
                  x}s4| j                  |       |j                  j                          |j                  j                          |j                  | j                  ||j                        dd      |_        |j                  | j                  ||j                        dd      |_	        t!        |j"                  |j$                        | j&                  |<   | j)                  ||j"                        |_        | j+                  ||j$                        |_        P | j                   j-                         D ]"  }| j/                  |t0        j2                         $ y )NTprependwith_kwargsF)r   always_call)rg   modulesrf   r   r   r   ru   _pre_forward_hook_handleremove_post_forward_hook_handleregister_forward_pre_hookr   _pre_forwardregister_forward_hookr   _post_forwardr<   r=   r>   ri   r   r   buffersrp   r#   r/   )rV   moduler   rm   buffers        r:   _instrument_fsdp_modulez&FSDPMemTracker._instrument_fsdp_moduleh  s    nn,,.  	F&*-#335
'1'C'CC#C>>?OP77>>@88??A88 88 &
(?(? %)(, 9  7 <B;W;W55fj>V>VW %$( <X <J8
 ?P(55(66?D33F; 594W4W 0 = =5$1 <<"$4$B$B %29 	D nn,,. 	F00##	r9   c                 z     j                    j                  t        j                   j                          dt        j
                  dt        dt        dd f fd}dt        j
                  dt        dt        dd f fd} j                   j                  |       j                   j                  |      f _	        y y )N	optimizerry   rz   rQ   c                     d_         y )NT)_in_optr   ry   rz   rV   s      r:   _opt_step_pre_hookz@FSDPMemTracker._instrument_optimizer.<locals>._opt_step_pre_hook  s      $r9   c                 T    j                  t        j                  |        d_        y r   )_track_optimizer_statesr#   r6   r   r   s      r:   _opt_step_post_hookzAFSDPMemTracker._instrument_optimizer.<locals>._opt_step_post_hook  s"     ,,\-=-=yI$r9   )
rh   r   r#   r6   r   	Optimizerr   register_step_pre_hookregister_step_post_hook_optimizer_hook_handles)rV   r   r   s   `  r:   _instrument_optimizerz$FSDPMemTracker._instrument_optimizer  s     ::!(()9)94::F$ ??$25$?B$$
% ??%25%?B%% 

112DE

223FG,D( "r9   c                 D    | j                          | j                          y rS   )r   r   )rV   s    r:   $_register_module_and_optimizer_hooksz3FSDPMemTracker._register_module_and_optimizer_hooks  s    $$&""$r9   c                 Z   | j                   j                         D ]  \  }}|j                         }|j                  j	                          |j
                  j	                          |j                  |j                  dd      |_        |j                  |j                  d      |_        |j                  x}s|j                  |_        |j                  |_         | j                   j                          | j                  )| j                  D ]  }|j	                           d | _        y y )NTr   F)r   )ri   r   r   r   r   r   r   r   r   r   r   r=   r>   clearr   )rV   rv   saved_methodsr   rm   handles         r:   &_deregister_module_and_optimizer_hooksz5FSDPMemTracker._deregister_module_and_optimizer_hooks  s(    ,,224	M 
!113J//668007792:2T2T''4 3U 3J/ 4<3Q3Q((% 4R 4J0 $.#?#???0=0J0J -1>1L1L .	M  	''--/''366   +/D( 4r9   inputs.c                 l     dt         j                  ddf fd}t        t         j                  ||       y)a%  
        This is used to track the input tensors to the model and annotate them as ``Inputs``.
        Args:
            inputs (Tuple[Any]): A tuple containing the input data. This can include tensors
                        as well as other data types. Only tensors will be tracked.
        trQ   Nc                 F    j                  | t        j                         y rS   )rp   r#   r7   )r   rV   s    r:   _track_inputsz2FSDPMemTracker.track_inputs.<locals>._track_inputs  s    00  r9   )torchTensorr   )rV   r   r   s   `  r:   track_inputszFSDPMemTracker.track_inputs  s,    	U\\ 	d 	 	ellM6:r9   externalc                      y)z$This is no-op for ``FSDPMemTracker``Nr8   )rV   r   s     r:   track_externalzFSDPMemTracker.track_external  s    r9   c                    | j                   dk(  r| j                          | j                          | j                          | j	                         | _        | j
                  j                         D ci c]  \  }}||t            c}}| _        | j                  j                          t        j                  |        | xj                   dz  c_         | S c c}}w )Nr   r|   )_depthr   _track_resize_track_dtensor_dispatchr   _peak_mem_snapr   r   	_peak_memr~   	__enter__r   )rV   r   r   s      r:   r   zFSDPMemTracker.__enter__  s    ;;!557 ((*"&";";"=D &*%8%8%>%>%@!C Xj))DN '')##D)qs   2Cry   c                 
   | xj                   dz  c_         | j                   dk(  rI| j                          | j                          | j                           | j                  j
                  |  t        j
                  | g|  y )Nr|   r   )r   r   _restore_resize_restore_dtensor_dispatchr~   __exit__r   )rV   ry   s     r:   r   zFSDPMemTracker.__exit__  si    q;;!779  "**,&D&&-""4/$/r9   c                    |t         j                  j                  j                  j                  k(  rt               r|d   }n ||i |xs i }| j                  rt        j                  }nC| j                  j                  r| j                  st        j                  }nt        j                  }|t        j                  j                  k(  rS| j                   t"        j$                  t"        j&                  fv r'|d   }| j)                  |t        j*                  d       |t        j,                  j                  k(  rD| j                   t"        j.                  k(  r'|d   }| j)                  |t        j0                  d       t3        t         j4                  t7        | j8                  |      |       | j                  j                  rt:        j<                  nt:        j>                  }	| jA                  |	       |S )Nr   Tr   r|   )!r   ops_c10d_functionalwait_tensordefaultr   r   r#   r6   r~   r   r   r3   r2   c10d_allgather_base_rj   rZ   r[   r^   rp   r4   _reduce_scatter_base_r`   r5   r   r   r   _trackrA   rM   rL   _update_peak_stats)
rV   functypesry   rz   resreftypeoutput_tensorinput_tensor
peak_states
             r:   __torch_dispatch__z!FSDPMemTracker.__torch_dispatch__  s   EII..::BBB " q'C-"-C <<"&&G$$T[["''G"&&G4((000T5E5EJ
 6
 !GM00'' $ 1  D..666  J$6$667L00++ $ 1  	ellGDKK$A3G%)%6%6%<%<M!!-BWBW 	 	
+
r9   rS   )rQ   N)rQ   r   ).N)'r)   r*   r+   r,   r   r   Moduler   r   r   rW   r   ru   r   r	   r   r   r   r!   r   rX   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__)rl   s   @r:   r   r   u   s   %T 157XX__7 u{{,,-7 
	7 .	 DD !)U53EtCQTH~3U-V)V WD 
"eE&+.S#X>??	@	DL "*"b&!1 
"b&		B -5RW,= 
"d(		8)) .6b#g->) 
"d(		)V,\.%00;5c? ;t ; 3ryy%//5<<GH3	3
0c 0d 0-r9   )5copyr   enumr   r   	functoolsr   r   typingr   r	   r
   r   r   r   typing_extensionsr   r   r   r   )torch.distributed._tools.fake_collectivesr   r   torch._guardsr   $torch.distributed._tools.mem_trackerr   r   r   torch.distributed.fsdpr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   torch.utils._python_dispatchr   torch.utils._pytreer   torch.utils.weakr   r   r   __all__r   r    r!   r   r   r#   r<   rA   rO   rZ   r   r8   r9   r:   <module>r     s      $ F F = =  0  * M M - P : - 9 

t_T]5yy~~8 >
 
F & . nZ nr9   