
    Vh                     `   U d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z'm(Z)m*Z*m+Z+m,Z,m-Z-m.Z. d d
l/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z8 d dl9m:Z: g dZ;dZ<dZ=dZ>dZ?e@eA   ZBee3eej                  eDeEeAf   ZFeeFeGeF   eHeF   eIeAdf   f   ZJeIeAeJf   ZKeGeK   ZLeIeAeeKeLf   f   ZM e@       ZNe@e   eOd<   e j                  d        ZQe	 G d d             ZRe	 G d deR             ZSej                  	 	 	 dIdej                  deAdeAdeVdeVd eBfd!       ZW G d" d#      ZXdJd$ZYddd%dej                  d&eHej                  j                  d'f   d(eVd)ee@ej                        d*eeR   d eSfd+Z\d,eIeAeJf   d-eMd.eSd dfd/Z]d0eej                  ej                  j                  f   d1eAd efd2Z^d3eIeAef   d.eSd eIeAef   fd4Z_ ej                         dej                  d.eSd eIeAeJf   fd5       Za ej                         dej                  d3eIeAeJf   d.eSd e5fd6       Zbd7ej                  j                  d dfd8Zcd3eMd eIeAeJf   fd9Zdd7ej                  j                  d3eIeAeJf   d.eSd eMfd:Ze ej                         dej                  d;eHej                  j                  d'f   d.eSd eMfd<       Zfdej                  d7ej                  j                  d-eMd.eSd eMf
d=Zg ej                         dej                  d;eHej                  j                  d'f   d3eMd.eSd df
d>       Zhddd%dej                  d)ee@ej                        d*eeR   d eIeAeJf   fd?Ziddd%dej                  d;eej                  j                  eej                  j                     f   d)ee@ej                        d*eeR   d eMf
d@Zjddd%dej                  d;eej                  j                  eej                  j                     f   d)ee@ej                        d*eeR   d eHeIeAeJf   eMf   f
dAZkdej                  d3eeIej                  eIeAeJf   f   eIeAeJf   f   d eIeAeJf   fdBZlddCdej                  d,eIeAeJf   d*eeR   d e5fdDZmddCdej                  d;eej                  j                  eej                  j                     f   d-eMd*eeR   d df
dEZnddCdej                  d;eej                  j                  eej                  j                     f   d,eIeAeJf   d-eMd*eeR   d e5fdFZoeddCdej                  d*eeR   d dfdG       ZpeddCdej                  d;eHej                  j                  d'f   d*eeR   d dfdH       Zqy)K    N)	GeneratorIterable)asdict	dataclassfield)chain)AnyCallablecastno_type_checkOptionalUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater'   _patched_state_dictc               #      K   t        j                         } t        j                          	 d  | rt        j                          y y # | rt        j                          w w xY wwN)gc	isenableddisableenable)
is_enableds    W/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextr?   Q   sD     JJJLIIK :IIK s   )A$A A$A!!A$c                       e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed	<   dZeed
<   dZeed<   y)r+   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiersN)__name__
__module____qualname____doc__rA   bool__annotations__rB   rC   rD   rE   rF   rG   rI   str     r>   r+   r+   \   s_    "H "OT!K!&$&$(T(FD!&$&). $.-s-rR   r+   c                   h   e Zd ZU  ee      Zeeeej                  f   ee
ej                  f   f   ed<    ee      Zeeeej                  f   ee
ej                  f   f   ed<    ee      Zee   ed<   dZeed<   dZeed<   ej&                  Zeed<    ee      Zeej2                     ed	<   y
)_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rJ   rK   rL   r   dictrV   r   rP   torchTensorr%   rO   rW   setrX   rY   rN   rZ   
contextlibnullcontextr[   r
   listr\   nnModulerQ   rR   r>   rT   rT      s    
 	d# tc5<< fell"#	% $ 	d# 4c5<< fell"#	% $ $)#=C=L$L$'33L(3$)$$?L$ryy/?rR   rT   modelnamerI   skip_ddp_prefixskip_compiler_prefixreturnc                    |j                  t        d      }d|vr|hS |j                  d      }g }| }t        |      D ]  \  }}	t	        |t
              r(|	dk(  sJ |j                  }|r-|j                  |	       ?t	        |t              r|t        |      dz
  k  rW||dz      t        k(  rHdj                  |      }
t        |t              }|
r|
 d}
|j                  D ch c]  }|
 | 
 c}c S t        |t              }|	t        k7  s|j                  |	       t        ||	      }t	        |t        j                   j"                  j$                        r*|	dk(  sJ |j&                  }|r4|j                  |	       Gt)        ||      r: t        ||             j+                  |	      x}rt)        ||      rt        ||      }|j                  |	       |	t,        j.                  j                  j0                  k(  r|t        |      dz
  k7  st3        d      t        ||	      } dj                  |      j                  t        d      hS c c}w )a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrn   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr    r^   _dynamo
eval_frameOptimizedModulerp   hasattrgetrd   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)rf   rg   rI   rh   ri   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqnremoved_fqns                 r>   	_get_fqnsr      s4   0 <<*B/D
$v

3IMH%i0 $<=h$ H,,,H"$$]3$'3y>A%%)AE*:k*I-0$X{;
 &xq\F4>4D4DES6(3%(EEx)<=H 33$$]3"8];%--":":"J"JK K///))H'$$]3 x!23"F'(4E"F"H"L"L!# ;  x5#*8[#A  /

 1 1 I III**&'VWW"8];I$<L HH]#++,>CDD5 Fs   #Ic                       e Zd Zy)_EXTRA_STATEN)rJ   rK   rL   rQ   rR   r>   r   r      s    rR   r   c              #      K   t               dt        j                  dt        dt        ffd | d      E d {    y 7 w)Nrn   curr_fqnrj   c              3     K   j                  |        |r| dnd}| j                         D ]T  \  }}|v rt        |       r'| t        |              j	                         v r|d d }n| | } ||      E d {    V t        | j                  d      | j                  d            D ]   \  }}|| j                  v r| | }||f " t        | j                  dt        j                  j                        t        j                  j                  k7  r7| t        j                  j                  j                   }|t!               f y y 7 ׭w)Nrm   rl   F)recurseget_extra_state)addnamed_childrenr   r{   valuesr   named_buffersnamed_parameters_non_persistent_buffers_set	__class__rd   re   r   r   rn   r   r   )	rn   r   rg   	submodulenew_fqnobjrI   r   visited_moduless	         r>   r   z+_iterate_valid_model_state.<locals>.recurse   sr    F#%-hZq>2%446 	3OD)O+  12>GF,=>@GGII #3B-%Jtf-y'222	3    /1H1HQV1H1W
 	ID# v999!
4&)G3,	 F$$&79R9RSyy(() "
2::#4#4#L#L"MNG<>))	) 3s   A;E>E?CErl   )r`   rd   re   rP   r   )rf   rI   r   r   s    `@@r>   _iterate_valid_model_stater      s?     &)eO *		  *S  *Y  *D ub!!!s   ;A AA)
submodulesoptionsoptims.
optim_onlyr   r   c                   |rt        j                  dt               |r|st        d      |xs
 t	               }i }i }t        |       D ]  \  }}t        |t              rt        | |      }	|j                  |d      }
|
2t        t        t           ||         j                  |	       ||   ||<   n|	j                         ||<   |	D ]  }
t        |t              r|||
<     t        |j!                               D ])  \  }}|D ]  }
t        t"        j$                  |      ||
<   ! + t               }|rat        |      }| j'                         D ]C  \  }}||vrt        | |      }	t)        |	      dk(  sJ d       |j                  d |	D               E |j*                  r|j,                  st/        d      t1        j2                  |       }|r|j,                  rat5        |j6                  |j6                        }t9        |j6                  |j6                  xs |j*                        }t:        j<                  }n<t?        |j6                  	      }tA        |j6                  	      }t:        jB                  }tD        jF                  d
        }tI        jJ                  || |||      }ntD        jL                  }tO        di tQ        |      ||||t        t        tR        jT                     |      | t)        |      dkD  dS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nro   z)Submodule FQN should only have 1 instancec              3   &   K   | ]	  }| d   yw)rm   NrQ   ).0r   s     r>   	<genexpr>z"_verify_options.<locals>.<genexpr>L  s     %@CQi%@s   z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              3      K   t        j                         5  t        j                  ddt               t	        j
                  | |||      5  d  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY ww)NignorezFSDP.state_dict_type)messagecategoryrn   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningrw   r   r   s       r>   $fsdp_state_dict_type_without_warningz=_verify_options.<locals>.fsdp_state_dict_type_without_warningj  sy      ((* 
''&<} ))!$3&7,C	  	
 
 	
 
s4   A;6A/A#A/	A;#A,	(A//A84A;r   r   )rV   rW   rX   r[   r\   rY   rZ   rQ   )+r   warnr   r   r+   r   rt   r   r   r   r   r`   rP   updatecopyrc   itemsr^   r_   named_modulesrx   rF   rA   
ValueErrorrw   r\   r   rB   r   r   FULL_STATE_DICTr   r   SHARDED_STATE_DICTra   contextmanager	functoolspartialrb   rT   r   rd   re   )rf   r   r   r   r   rV   rW   rg   paramfqnsr   param_fqns_rX   rn   r\   r   r   r   r   r[   s                        r>   _verify_optionsr     s+    I 		
 &I
 	
 +)+G 	 
 	  2%8 /ee\*%##E40?S,U34;;DA+<U+C!%( (,yy{e$ 	/Ce\2).!#&	//  399;< D 	DC)-ellF)C!#&	DD $'5_
!//1 	ALD&Z'UD)Dt9>N#NN>%%%@4%@@	A ##G,C,CM
 	
 $$U+L "" 3&22w?R?R! '?&22#//O73O3O'# ,;;O 6&22! 'B&22'# ,>>O		"	"	 
#	$ !((0+/$;
 "-- 	
/	+3-!$ryy/<8#^&kAo	 	rR   model_state_dictoptim_state_dictinfoc                     |j                   D ]  }t        |      }|J d        |j                  rk| si|j                  s]|j                  sQ|j
                  r|j                  s9|j                  r-|j                  s!t        dt        j                         d      |j                  r4|s2|j
                  r|j                  s|j                  st        d|       | j                         D ]  }t        |v st        | dt         d       y )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rm   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)r\   r   rY   rX   rC   rB   rA   rE   rF   r   distget_rankrZ   keysry   )r   r   r   rn   
fsdp_statekeys         r>   _verify_state_dictr     s'   
 ## SCFK
%R'RR%S 	 ''))!!d&:&:KK))'mmo'q*
 	
  %%$*>*>..::J9KM 
  $$& #%z+ /* * rR   r   apic                     t        | |      }|t        v r+t        j                  t        | j                  |      |       }|S )N)self)r{   r6   r   r   r   )r   r   calls      r>   _state_dict_fnr     s9    3D""  !<3GKrR   
state_dictc                     |j                   rF|j                  rt        j                  j	                         sdnd}t        | |j                  |      S |j                  rt        |       S | S )NrQ   )r   )rB   
ranks_only)rA   rB   r^   distributedis_initializedr   r   )r   r   r   s      r>   _maybe_full_or_cpu_state_dictr     sn      $$E,=,=,L,L,N  	
 "D$4$4
 	
 
		)*55rR   c                    |j                   si S |j                         5   t        | d             }d d d        t        j	                               D ]w  }t        | |      }t        |      dk(  s	J ||f       t        t        |            }||k7  s@dt        fd} |||      st        d| d|       |j                  |      ||<   y |j                  rgi }|j	                         D ]P  }|j                  D ]?  }|j                  |      s|j                  r	||   ||<   *|t        |      d  }	||   ||	<   A R |}|j                  rI| j!                         D ]6  \  }}
|
j"                  rt        | |      }|D ]  }|j                  |        8 t        |j%                               D ]9  \  }}t'        j(                  |      s|j*                  s)|j                  |       ; t-        ||      S # 1 sw Y   xY w)Nr   ro   rj   c                    t        |      t        |       k\  ry|j                  d      }| j                  d      }d}t        |      D ]:  \  }}|||   k(  r'|dz  }|t        |      k(  s"|t        |      dz
  k(  c S |dv r: y y)NFrm   r   ro   )rn   rp   T)rx   rr   rs   )r   r   	fqn_split	key_splitfqn_idxkey_idxkey_names          r>   verifyz%_get_model_state_dict.<locals>.verify  s    s8s3x' IIcN	IIcN	)29)= %%GX9W#551"c)n4#*c)nq.@#@@!%<< $% rR   zAn unexpected key, z, exists. FQN is )rY   r[   r   rc   r   r   rx   nextiterrN   r   poprX   
startswithrD   rC   r   requires_gradr   r^   	is_tensoris_metar   )rf   r   r   r   r   r   r   new_state_dictr   r   r   ps               r>   _get_model_state_dictr     s    					 ;8^E<8:
; JOO%& 2$4yA~*T{*~4:#:D " #s#"%8=Nse#TUU(nnS1JsO72: /1??$ 	>C11 >~~f-//*4S/N3'!#f+-0G.8oN7+>	> $
  002 	$JC""UC(D $s#$		$ z'')*  Q??1!))NN3  )T::u; ;s   G55G?c           	         |j                   r|s|j                  st        i i       S i }t        | |j                        D ]  \  }}t        | ||j                        }t        | ||j                  dd      }t        ||      D ]f  \  }}	|j                  rt        j                         dk(  r9||	k7  r4|j                  |d       }
|
|j                  rt        d| d      |
||	<   |||	<   h  d}|j                  s|j                  r}t               }|j                         D ]J  \  }}t        j                   |      s|j#                         dkD  s0|j%                  |j&                         L t        j&                  d      |v r&|j)                  t        j&                  d             d}t+        |      dk(  r.|j%                  t        j,                  j/                                nt+        |      dkD  rt1        d	      |j                  r3t3        |||j                         |j                  |j4                  
       n(|j                  rt7        |||j                                |j                         D ]
  \  }}|||<    |j9                         5  t;        t         t=        | d      ||j                  |            cd d d        S # 1 sw Y   y xY w)NF)rh   ri   r   zMissing key: rm   metaTro   zMultiple devices found)devicerE   rB   r   load_state_dict)r   rE   assign)rY   rF   r"   r   rI   r   zipr   r   r   rE   r   rA   r`   r   r^   r   dimr   r   removerx   distributed_c10d_get_pg_default_devicer   r   rB   r   r[   r   r   )rf   r   r   local_state_dictr   valuer   fqns_with_prefixr   fqn_with_prefix
load_valuer   deviceslocal_states                 r>   _load_model_state_dictr    s    Z8Q8Q R((08N8NO 6
UT%;%;<$""!!&
 %(.>$? 
	6 C--A1E('^^C6
%{{*]3%q+ABB2<J/05_-
	66, F  D$8$8%*002 	*JCu%%))+/ELL)	*
 <<7*NN5<</0Fw<1KK--DDFG\A566$$! {{}{{ ,, !!":/?V 0 6 6 8 	*C)JsO	* 
			 
4N5"34%dkk&

 
 
s   -KKoptimc                 h   | j                   ry| j                  D ]  }|t           D ]  }|j                    y ! | j                  D ]7  }|t           D ])  }|j                  st        j                  |      |_        + 9 g }| j                  D ]R  }d|v s|j                  |d          t        |d   t
        j                        rt        j                  d      nd|d<   T | j                  d       | j                  D ]  }d|v s|j                  d      |d<    | j                  d       y)zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)set_to_none)r5   r3   _PARAMSgradr   r^   
zeros_likerv   rt   r_   tensorstepr   	zero_grad)r  param_groupr   lrss       r>   _init_optim_stater  c  sB    {{ ))  ) 	Ezz%	
 )) 5 ) 	5E"""--e4
	55 C)) ;JJ{4() k$/> S!  
JJtJ )) +; #
K+ 
OOO%rR   c           
         d }i }t        t        | t                 j                         D ]D  \  }}t        t        |      j                         D ]  \  }} ||       ||t         d| d| <     F t        t        | t
                 D ]\  }|j                  t              }t        t        t           |      D ]+  }|j                         D ]  \  }}||t
         d| d| <    - ^ |S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 ~    t        | t        j                  t        t        f      st        dt        |        d      y )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rm   )rt   r^   r_   intfloatNotImplementedErrortype)vs    r>   _raise_if_type_not_supportedz?_flatten_optim_state_dict.<locals>._raise_if_type_not_supported  s>    !ellC78%7)1&  9rR   rm   )
r   r(   _STATEr   r)   _PGr   r	  rc   rP   )	r   r  retr   r5   kr  r  r   s	            r>   _flatten_optim_state_dictr    s    T !#C=*V*<=CCE +
U.446 	+DAq(+)*C6(!C5!%&	++
 -z#? ,w'S	4( 	,C#))+ ,1*+se1SE1#&',	,,
 JrR   c                    i }g }t         |t        |i}| j                  D ]  }|j                  t        g i       |t           D ]  }|j
                  |   D ]  }||j                  v r7d}	|j                         D ]!  }
|
t        k(  rt         d| d|
 }||v rd}	 n nd}	|	sM|d   t           }t        |t              sJ |j                  |       |j                  si ||<   | j                  |   j                         D ]'  }|t          d| d|    t        t        ||         |<   )   t        t        t           |d   t                 d   }|j                         D ]V  }
|
t        k(  r|t         d| d|
    }|
|d   vr	||d   |
<   .|d   |
   |k7  s:t        d| d|
 d| d|d   |
    d	        |S )	z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    Frm   Tr   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r  r  r3   rv   r	  rV   rW   r   rt   rc   r   r5   r   r(   rP   r   )r  r   r   r5   pg_state
return_osdr  r   r   	in_paramsr  flatten_keyr4   
state_namefirst_param_fqnr   s                   r>   _unflatten_optim_state_dictr&    s:    E"$H&,eS(%CJ)) -"& ) 	E--e4  $444 %I(--/ <$),Qse1QC&8&*4(,I !%I !"g.!&$///c"**c
"'++e"4"9"9"; JBL!(!C5*6CDc
3J?3	> tCy(2,w*?@C!!# 	AG|#a'8!=>E$"'Q"aE)"==L<MQqc R 3HRLO3DAG 	E-^ rR   
optimizersc                    |j                   si S t        i t        g i}|D ]  }t        |        t	        |d             }|j
                  r|j                         5  t        j                  | ||      }d d d        |s_t        |t           j                               D ]9  }d|v s|t           j                  |      |t           |j                  dd      <   ; |t           D ]1  }|t           D cg c]  }|j                  dd       }}||t        <   3 n/t        t        j                  d |j                   D                    }t#        t%        |t'        t)        |                        }	i }
| j+                         D ]I  \  }}t-        | |      }t)        |      dk(  sJ t/        t1        |            }||	vr;|	|   }||
|<   ||
|<   K t        |t           j                               D ])  }|
|   }|t           j                  |      |t           |<   + |t           D ]#  }|t           D cg c]  }|
|   	 c}|t        <   % |s-t3        t4        |t                 j7                  |t                  t3        t8        |t                 j;                  |t                   |j<                  rt3        t>        tA        |            }tC        ||      S # 1 sw Y   ixY wc c}w c c}w )Nr   rp   z
_orig_mod.rl   c              3   .   K   | ]  }|t              y wr8   )r	  )r   gs     r>   r   z(_get_optim_state_dict.<locals>.<genexpr>*  s     -UQaj-Us   ro   )"rZ   r  r  r  r   r\   r[   rw   r   rc   r   r   rq   r	  r   from_iterabler3   r]   r   rangerx   r   r   r   r   r   r(   r   r)   extendrG   r*   r  r   )rf   r'  r   r   r  osdr  r*  r4   param_pid_mappingfqn_pid_mappingr   r   r   r   pidgroups                    r>   _get_optim_state_dictr3    s    	,2BR+@ ,H% 1nUL13""$ ?++E5#>? #f+**,- R!#?B6{q?QCK		, ;<R X $?@zJ!!))L"5JJ#'
$ %---U%BTBT-UUVF $Ss6{1C%D E O#446 +
U ,4yA~%~4:& 11'.'*$'*$+ CK,,./ 8%c*#&v;??3#7FC 8 S RBG.!Q3/#"6!QgR ],V45<<S[I 0 56==c#hGY,H\ (( 9:J K
 ))94@@_? ? K* "Rs   K1K'0K,K$	c           
         i }g }t         |t        |i}i }t        d t        t        |t                  j                         D              r|S |j                  D ]*  }|j                  t        g i       |t           D ]%  }	|j                  |	   D ]  }
|
|j                  v rCd}t        t        |t                 D ]&  }|
t        t        t           |t                 v s$d} n nd}|sZ|d   t           }t        |t              sJ |j                  |
       |	j                  rt        t        |t                  |
   ||
<   t        t        |t                 D ]D  }|
t        t        t           |t                 v s$t!        |t                 dz
  |t#        |      <   F  ( t!        |t                 dk(  sdg }t        t        |t                 D ]>  }t!        t        t        t           |t                       dk(  s.|j                  |       @ t!        |      dk7  rt%        d      t!        |t                 t!        |j                        k7  rt%        d      t!        |t                 dz
  |t#              <   - t        t        |t                 D ]M  }|j'                  t#        |      d      }|dk(  r$|j)                         D ]  \  }}|t        k(  r|||   |<    O |S )	a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c              3   <   K   | ]  }t        |t                y wr8   )rt   r  )r   r  s     r>   r   z*_split_optim_state_dict.<locals>.<genexpr>f  s       
1cs   FTr   ro   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r  r  allr   r(   r   r3   rv   r	  rV   rW   r)   rc   rP   rt   r   rx   idr   r   r   )rf   r  r   r   r5   r   r!  
pg_mappingr  r   r   r"  loaded_param_groupr4   r  pg_idxr   r   s                     r>   _split_optim_state_dictr;  L  s	   * E"$H&,eS(%CJ!#J
 $(8H8P$Q$V$V$X   )) /J"& ) 	VE--e4 V$444 %I.2)+;C+@/ "* $tCy2DW2M"NN(,I!" !%I !"g.!&$///c"&&!%m5Ef5M!Ns!SE#J*.%'7'<+ V& d49.@.IJJ=@C=QTU=U
2&8#9:	V'V	V4 {7#$)C&*+<>Ns>S&T 3"tDI'9''BCDIJJ123 3x1} 1  #C()S1C1C-DD =  25Z_1E1IJr,-._/Jb -/?/DE 	*;4R<%++- 	*JCg~$)HVS!		*	* rR   c           
         |j                   sy |D ]  }t        |       |r@t        |v rt        | |||      }n+t	        |t        t        t        t        f   |      |      }ni }|j                  rT| j                         D ]  \  }}t        | |      }t        | |d      }	||	k(  r't        |      dk(  sJ |j                         }
|	j                         }|t           D ]N  }t        t        t        t        f   |      }|t            D cg c]  }|j#                  |
|       }}||t         <   P t        t$        |t                 }t'        |j)                               D ]+  }|
|v s|j                  |      ||j#                  |
|      <   -  |j+                         5  t-        j.                  | ||      }d d d        n
|j0                  rd|_        t3        | |f|      }d|_        d fd}t5        t6        j8                  ||      }J t;        |      \  }}t;        |      \  }}|j<                  rt?        ||       ntA        ||       |j)                         D ]  }||vs||v sJ ||   ||<   ||   ||<    tC        ||      }|t           D ]/  }t         |vsg t        t        t        t        f   |      t         <   1  tE        |d      |        y c c}w # 1 sw Y   %xY w)	NF)ri   ro   Tc                     | j                         dkD  r*| j                  | S | j                  k7  rt        d      | S )Nr   zDevice mismatch)r   r   r   )tr   s    r>   _devicez'_load_optim_state_dict.<locals>._device  sD    557Q;~!"   188+():;;rR   r   r   )r   )#rZ   r  r  r;  r&  r   r]   rP   r'   r\   r   r   rx   r   r  r	   r	  rq   r(   rc   r   r[   rw   optim_state_dict_to_loadrA   r3  r$   r^   r_   r   rF   r   r   r   r   )rf   r'  r   r   r  r   original_fqn_r   fqns_with_compilerr   fqn_with_compilerr*  valr   r4   	osd_stater  r   r?  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keypgr   s                             @r>   _load_optim_state_dictrM    sC     TN% ##:5*d$  $?4S)^ 4jA4$   " $)#9#9#; Xa 5%.<e&" --4yA~%~hhj$6$:$:$<!)#. *AtCH~q1CGJ7|@CC):;F  $*CL* !0@0HI	inn./ XAaxGP}}UVGW	!))C1B"CDX%X, ""$ #'#@#@5"2$   !!#(D 4UUHdK#'D F ellG5EFA%%%':;K'L$K3FGW3X00((%k3DVT&{4EfU
 )--/ J	$55$3333>y3I%i03>y3I%i0	J
  5!#4  's+ A"$>@Dc9n-r27;A 	1u/0<LMiTN: s   K&
#K++K4	c                    t               5  t        | dd||      }t        | |      }t        |i |       |cddd       S # 1 sw Y   yxY w)aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    rQ   Fr   r   r   N)r?   r   r   r   )rf   r   r   r   r   s        r>   r,   r,     sV    0 
 
 !
 1=+R6
  
  
 s   +A  A	c                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | ||      }t        i ||       |cddd       S # 1 sw Y   yxY w)a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    TrO  N)	r?   rt   r^   r  	Optimizertupler   r3  r   )rf   r'  r   r   r   r   s         r>   r-   r-   0  s    6 
   *ekk&;&;< Mz" 	
 !
 1
DI2/6     s   AA33A<c                   t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | |      }t        | ||      }t        |||       ||fcddd       S # 1 sw Y   yxY w)a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
        ...     fsdp_model, fsdp_optim
        ... )

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    FrO  N)
r?   rt   r^   r  rQ  rR  r   r   r3  r   )rf   r'  r   r   r   r   r   s          r>   r.   r.   ]  s    P 
 2 *ekk&;&;< Mz" 	
 !
 1=0
DI+-=tD!11!2 2 2s   A,BB
c           
         |si S t        t        t        |j                                     t        j
                        rt        j                  dt               t        t        t        j
                  t        t        t        f   f   |      }i }|j                         D ]  \  }}| j                         D ]y  \  }}||k7  rt        | |      }t!        |      dk(  sJ d       t        t        |             d}	|j#                  |j                         D 
ci c]  \  }
}|	|
z   | c}}
       {  |S t        t        t        t        f   |      S c c}}
w )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.ro   z/FQNs for a submodule should only have 1 elementrm   )rt   r   r   r   rd   re   r   r   r   r   r]   rP   r'   r   r   r   rx   r   )rf   r   cast_state_dictr   r   sub_state_dictrg   mr   r   subfqnr   s               r>   _unflatten_model_state_dictrY    s<    	$tJOO-./;" 	
 tBIItCN/C$CDjQ/1)8)>)>)@ 
	%I~ ..0 	a	> -4yA~X'XX~ d,-Q/%%AOAUAUAWXVf_e+X	
	 Di(*55	 Ys   E)r   c                    t        | |      }t               5  t        | dd|      }t        |i |       t	        | ||      cddd       S # 1 sw Y   yxY w)a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    rQ   Fr   r   N)rY  r?   r   r   r  )rf   r   r   r   s       r>   r/   r/     s`    : .I. 
 EubUGL+R6%e-=tD	E E Es   )A

Ac                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d|      }t        i ||       t        | |||       ddd       y# 1 sw Y   yxY w)a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
        ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
        initialized correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr[  N)	r?   rt   r^   r  rQ  rR  r   r   rM  )rf   r'  r   r   r   s        r>   r0   r0      sz    > 
 	J *ekk&;&;< Mz" 	
 ujT7S2/6uj2BDI	J 	J 	Js   AA11A:c                .   t        | |      }t               5  t        |t        j                  j
                        r|fn
t        |      }t        | || |      }t        |||       t        | |||       t        | ||      cddd       S # 1 sw Y   yxY w)a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
        is called on the optimizers. Otherwise, the optimizer states won't be initialized
        correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r[  N)rY  r?   rt   r^   r  rQ  rR  r   r   rM  r  )rf   r'  r   r   r   r   s         r>   r1   r1   +  s    \ .I. 
 E *ekk&;&;< Mz" 	
 :.>*>
 	+-=tDuj2BDI%e-=tDE E Es   A*BBc                $   t        j                  t        | |      fd}|| _        t        j                  t        | |      dt
        t        t        f   ffd}|| _        t        j                  |       t        j                  |       y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rf   r   c                               S r8   rQ   _state_dict_calls   r>   state_dict_callz0_patch_model_state_dict.<locals>.state_dict_call      !!rR   r   c                      |        y )N)r   rQ   r   _load_state_dict_calls    r>   load_state_dict_callz5_patch_model_state_dict.<locals>.load_state_dict_call      z:rR   N)r   r   r,   r   r/   r]   rP   r	   r   r6   r   )rf   r   rb  rg  rf  ra  s       @@r>   _patch_model_state_dictri  m  s    6 !((" 'E%--;c3h ; 1EO,01rR   c                   t        j                  t        | ||      fd}t        j                  t        | ||      dt        t
        t        f   ffd}t        j                  |       t        j                  |       t        |t        j                  j                        r|fn
t        |      }|D ]  }||_        ||_         y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rf   r'  r   c                               S r8   rQ   r`  s   r>   rb  z4_patch_optimizer_state_dict.<locals>.state_dict_call  rc  rR   r   c                      |        y )N)r   rQ   re  s    r>   rg  z9_patch_optimizer_state_dict.<locals>.load_state_dict_call  rh  rR   N)r   r   r-   r0   r]   rP   r	   r6   r   rt   r^   r  rQ  rR  r   r   )rf   r'  r   rb  rg  r  rf  ra  s         @@r>   _patch_optimizer_state_dictrm    s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"78 
: 
  5* 45rR   )rH   TT)rH   )rra   r   r9   r   collections.abcr   r   dataclassesr   r   r   	itertoolsr   typingr	   r
   r   r   r   r   r^   torch.distributedr   r   torch.nnrd   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   rw   r   r   r   r   r   $torch.distributed.fsdp._common_utilsr   r    torch.distributed.tensorr!   torch.nn.modules.moduler"   torch.nn.parallelr#   ru   torch.utils._pytreer$   __all__ry   r  r	  r  r`   rP   r%   r_   r  r  r&   rc   rR  r]   r'   r(   r)   r*   r6   rO   r   r?   r+   rT   cachere   rN   r   r   r   r  rQ  r   r   r   r   no_gradr   r  r  r  r&  r3  r;  rM  r,   r-   r.   rY  r/   r0   r1   ri  rm  rQ   rR   r>   <module>r     s     	  / 0 0  F F     A 	 	 	 - 5 < -" 
		Sg}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  &)U S] *   ,. ,. ,.^ @% @ @   . !%DE99DE
DE DE 	DE
 DE DE DEN	 	%"Z ,0*.99%++'',- 
 RYY( &' D*3	>**(* * 
	*Zbii)>)>>? c h S#X&4	#s(^$ @;99@;*@;	#y.@; @;F B
99B
S)^$B
 B
 	B
 B
J'&U[[22 '&t '&T=*< =c9nAU =@<;;  <S)^$< < 	<~ <A99<Aekk++S01<A <A 	<A <A~[99[;;  [ )[ 	[
 [| ]N99]Nekk++S01]N #]N 	]N
 
]N ]NF ,0*.	" 99"  RYY("  &'	" 
 
#y." R ,0*.* 99* ekk++Xekk6K6K-LLM*  RYY(	* 
 &'*  * b ,0*.X299X2ekk++Xekk6K6K-LLMX2 RYY(	X2
 &'X2 4Y!334X2v6996d299d3	>&::;T#y.=QQR6 
#y.6J +/	$E99$E3	>*$E &'	$E
 $EX +/(J99(Jekk++Xekk6K6K-LLM(J )(J
 &'(J 
(Jb +/=E99=Eekk++Xekk6K6K-LLM=E 3	>*	=E
 )=E &'=E =ED  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rR   