
    Vh-                        d dl Z d dlZd dlmZ d dlmZ d dlZd dlmc m	c m
Z d dlmZ d dlmZmZmZmZmZmZ d dlmZmZmZmZmZmZ d dlmZ ddlmZ d	Z  ejB                         d
ede"fd       Z#dedejH                  ddfdZ%dedejH                  ddfdZ&e jN                  dedejH                  defd       Z(dede"de"de"de"ddfdZ)e jN                  dejH                  dede"de"de"de"fd       Z*e jN                  dejH                  dede"de"de"de"fd       Z+e jN                  dejH                  de"de"de"de"de"fd       Z,dedejH                  ddfdZ-dedejH                  ddfdZ.y)    N)	Generator)cast)
_FSDPState_get_module_fsdp_state_has_fsdp_params_module_handleHandleTrainingStateTrainingState)
_lazy_init%_reset_flat_param_grad_info_if_needed_reshard_reshard_grads_unshard_unshard_grads)	_p_assert   )FlatParamHandle_flat_paramhandlewriteback_gradc                     dt         j                  dt         j                  f fd} | j                        } j                  j                  d|j	                          j                  |       |rf j                  }|W j                  j                  J  | j                  j                        }|d|j	                          j                  |       yyy)aP  
    For the handle, writes back the this rank's shard of the unsharded
    flattened parameter to the sharded flattened parameter. If
    ``writeback_grad=True``, then writes back to the sharded gradient as
    well.

    Precondition: The handle's ``FlatParameter`` 's data points to the
    padded unsharded flattened parameter.
    flat_param_or_gradreturnc                     j                   r0t        j                  | j                  j                        \  }}|S | S )N)uses_sharded_strategyr   _get_unpadded_shardrank
world_size)r   shard_r   s      [/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/fsdp/_unshard_param_utils.py
_get_shardz-_writeback_to_local_shard.<locals>._get_shard1   sD    '' '::"!!HE1
 L "!    N)torchTensor
flat_param_local_shardnumelcopy_sharded_gradgrad)r   r   r"   param_shardexisting_grad
grad_shards   `     r!   _writeback_to_local_shardr/   "   s    "u|| " " V../K
""#8[%6%6%89??L++$$$))555#F$5$5$:$:;J.J,,./55jA % r#   statemoduler   c                     t        | |      rCt        t        j                  |j                        j
                  j                  t        d       yy)a  
    De-registers the flattened parameter from the wrapped module, hiding it
    from ``nn.Module`` methods.

    We do not use ``del`` because we want ``FLAT_PARAM`` to always be an
    attribute but dynamically change whether it is visible to ``nn.Module``
    methods.
    N)r   r   nnModuler1   _parameterspop
FLAT_PARAM)r0   r1   s     r!   _deregister_flat_paramr8   J   s8     v&RYY&2266z4H 'r#   c                     t        | |      }t        | |      r@|j                  t        t        j
                  |j                        j                  t        <   yy)a2  
    Registers the flattened parameter to the wrapped module, making it
    visible to ``nn.Module`` methods.

    We do not use :meth:`nn.Module.register_parameter` because we want
    ``FLAT_PARAM`` to always be an attribute but dynamically change whether
    it is visible to ``nn.Module`` methods.
    N)	r   r   r&   r   r3   r4   r1   r5   r7   r0   r1   r   s      r!   _register_flat_paramr;   X   sD     E6*Fv&AGARARRYY&22:> 'r#   c              #     K   t        | |      }|sd yt        | |       	 |j                         5  d ddd       |j                  st	        | |       yy# 1 sw Y   #xY w# |j                  st	        | |       w w xY ww)ax  
    Assumes that the flattened parameter is unsharded. When in the context,
    de-registers the flattened parameter and unflattens the original
    parameters as ``nn.Parameter`` views into the flattened parameter.
    After the context, re-registers the flattened parameter and restores
    the original parameters as ``Tensor`` views into the flattened
    parameter.
    N)r   r8   unflatten_as_params_use_orig_paramsr;   r:   s      r!   _unflatten_as_paramsr?   g   s      E6*Fuf-	4++-  **$UF3 +  **$UF3 +s2    BA& AA&  BA#A& &BB	writeback
rank0_onlyoffload_to_cpu
with_gradsc           	         |r-|s| j                   st        d| d| j                    d| d      |r-| j                  r!| j                  j                  st        d      |r|rt        d      |r|st	        j
                  d       y y y )Nzwith_grads=z, use_orig_params=z, offload_to_cpu=z is not supported yetz5offload_to_cpu=True and NO_SHARD is not supported yetz7writeback=True and rank0_only=True is not supported yetzoffload_to_cpu=True and rank0_only=False may result in theunsharded parameters being redundantly copied to CPU memory for GPUs sharing the same CPU memory, which risks CPU OOM. We recommend using offload_to_cpu=True with rank0_only=True.)r>   NotImplementedError_handler   warningswarn)r0   r@   rA   rB   rC   s        r!   _validate_unshard_params_argsrI   ~   s     ~U-C-C!* &$556 7,- .#$
 	
 %--1T1T!C
 	
 Z "E
 	
 jH	
 )~r#   c              #     K   t        |||||       |j                  j                          t        ||       }d}|r|j                  t
        j                  k7  r|}|sd y|j                  t
        j                  k(  sJ d|j                          t
        j                  |_        t        |       |j                         }|j                  j                         }	t        |||	|	       |rt        |       |rD|j                  dk7  r5t        |||       |rt        |       	 d t
        j                  |_        yt!        j"                         5 }
|r+|j$                  r|
j'                  |j)                                |j*                  s|
j'                  t-        ||              	 d |
j/                          |rt1        ||       t        |||       |rt        |       t
        j                  |_        	 ddd       y# t
        j                  |_        w xY w# |
j/                          |rt1        ||       t        |||       |rt        |       t
        j                  |_        w xY w# 1 sw Y   yxY ww)zl
    This unshards the parameters for a single FSDP state ``state`` that
    corresponds to ``module``.
    Nz/Expects the handle training to be IDLE but got r   )rI   _device_handlesynchronizer   _training_stater	   SUMMON_FULL_PARAMSIDLEr   needs_unshardcurrent_streamr   r   r   r   r   
contextlib	ExitStackr   enter_contextto_cpur>   r?   closer/   )r1   r0   r@   rA   rB   rC   maybe_handler   free_unsharded_flat_paramcomputation_streamstacks              r!   _unshard_fsdp_state_paramsr[      s1     "y*nj 
$$&!%0LF((,?,R,RR!!%8%=%== 
9&:P:P9QR= 1CCF)&1 & 4 4 6 --<<>UF.0BCvejjAo 9:6"	>%8%=%=F" !!# 	Bu&">">##FMMO4
 ))##$8$GH	B-fjA(AB"6*)<)A)A&'	B 	B &9%=%=F"  -fjA(AB"6*)<)A)A&'	B 	BsQ   D!I;$H (*I;AI/(H,AI/:	I;HI;AI,,I//I84I;c              #     K   t        |||||       t        ||        |j                  t        j                  k(  rt        d      |j                  t        j                  k(  rt        d      t        | |||||      5  	 t        j                  |_        d  t        j                  |_        	 d d d        y # t        j                  |_        w xY w# 1 sw Y   y xY ww)Nz:Cannot manually unshard parameters during forward/backwardzECannot manually unshard parameters when already unsharding parametersr1   r0   r@   rA   rB   rC   )	rI   r   training_stater
   FORWARD_BACKWARDAssertionErrorrN   r[   rO   r]   s         r!   _unshard_params_for_summonra      s      "y*nj uf}===H
 	
 
		!A!A	AS
 	
 
$%
 6	6#0#C#CE #0#5#5E 6 6 $1#5#5E 6 6s6   A=C?CB9C0	C9CCCCrecursec              #   x  K   |s6t        |       }|"t        j                         5  d ddd       y|g| gf}nt        j                  |       }t        j
                         5 }t        | D ]%  \  }	} |j                  t        | |	||||             ' d ddd       y# 1 sw Y   yxY w# 1 sw Y   yxY ww)z~
    This unshards FSDP-managed parameters for all modules with FSDP applied in
    the module tree rooted at ``module``.
    Nr]   )	r   rR   nullcontexttraversal_utils_get_fsdp_states_with_modulesrS   ziprT   ra   )
r1   rb   r@   rA   rB   rC   optional_statestates_and_modulesrZ   r0   s
             r!   _unshard_paramsrj     s      /7!'') -.9,JJ6R				 5 "45 
	ME6*!')#1)	
	 	  s3   $B:B"9B:$5B.	B:"B+'B:.B73B:c                     t        | |      }|syt        |j                  d| j                   d|j                          |j                          t	        | |       y)zO
    Deregisters the original parameters; registers the ``FlatParameter``.
    Nz)Inconsistent `_use_orig_params` -- FSDP: z	 handle: )r   r   r>   _deregister_orig_paramsr;   r:   s      r!   rl   rl   5  sd     E6*F
3E4J4J3K L**+	-
 ""$'r#   c                     t        | |      }|syt        | |       |j                  |j                        r!|j	                          |j                          y|j                  d       y)zO
    Deregisters the ``FlatParameter``; registers the original parameters.
    NT)	as_params)r   r8   
is_shardedr&   _use_sharded_views_use_sharded_grad_views_use_unsharded_viewsr:   s      r!   _register_orig_paramsrs   E  s]     E6*F5&)**+!!#&&(##d#3r#   )/rR   rG   collections.abcr   typingr   r$   'torch.distributed.fsdp._traversal_utilsdistributedfsdp_traversal_utilsre   torch.nnr3   $torch.distributed.fsdp._common_utilsr   r   r   r   r	   r
   %torch.distributed.fsdp._runtime_utilsr   r   r   r   r   r   torch.distributed.utilsr   r   r   r7   no_gradboolr/   r4   r8   r;   contextmanagerr?   rI   r[   ra   rj   rl   rs    r#   r!   <module>r      sw     %   A A    . ( 
 $B$B$B $BNI* Ibii ID IS
 SBII S$ S 4
 4BII 4) 4 4,


 
 	

 
 

B IBIIIBIB IB 	IB
 IB IB IBX  6II 6 6  6 	 6
  6  6  6F !II!! ! 	!
 ! ! !H(: (ryy (T ( 4 4RYY 44 4r#   