
    Vh                        d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmc mc mc mZ d dlmZ d dlmc mZ d dlmZmZmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d d	l/m0Z0m1Z1m2Z2 d d
l3m4Z4 d dl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z< ddl=m>Z>m?Z?  ej                  eA      ZBde deCfdZDdeEdeEfdZFdej                  de deeHeEeEeEf      fdZIdej                  deeHeEeEeEf      fdZJe	 	 	 	 d=dej                  de deCdeCdeCdeCddfd       ZKedej                  de ddfd       ZLdej                  de ddfdZMdej                  de deCdeCddf
dZNedej                  de deOeEef   d eEd!e	deOeEef   fd"       ZPede dej                  ddfd#       ZQedej                  de deOeEef   d eEdeOeEef   f
d$       ZRdej                  de deOeEef   d eEddf
d%ZSdej                  de ddfd&ZTde dej                  ddfd'ZUedej                  de deOeEef   d eEdeOeEef   f
d(       ZVdej                  de ddfd)ZWdej                  de deOeEef   d eEddf
d*ZXde dej                  ddfd+ZYedej                  de deOeEef   d eEdeOeEef   f
d,       ZZedej                  de ddfd-       Z[edej                  de deOeEef   d eEddf
d.       Z\e j                  de defd/       Z^e ej                         dej                  deOeEef   d eEd0edeOeEef   f
d1              Z`e ej                         dej                  ddfd2              Zaede ddfd3       Zbe ej                         dej                  deOeEef   d eEd0eddf
d4              Zce ej                         dej                  d5eHedeE   edeE   f   d0eddfd6              Zed7e fd8Zfed7e d9eEd:e	d;eOeEef   ddf
d<       Zgy)>    N)	GeneratorIterator)AnyCallablecastno_type_check)init_from_local_shardsShardShardedTensor)_mesh_resources)
_FSDPState._get_module_fsdp_state_if_fully_sharded_module_has_fsdp_params_is_composable_module_handleclean_tensor_nameFSDP_PREFIXFSDP_WRAPPED_MODULE)SimpleProfiler)!_cast_buffers_to_dtype_and_device_get_orig_buffer_dtypes
_lazy_init%_reset_flat_param_grad_info_if_needed)FullStateDictConfigShardingStrategyStateDictType)DTensor)_replace_by_prefix   )_ext_all_gather_dtensor_ext_chunk_dtensor_ext_chunk_tensor_ext_post_unflatten_transform"_ext_pre_load_state_dict_transform)_unshard_fsdp_state_params
FLAT_PARAM
fsdp_statereturnc                 t    | j                   t        j                  k(  xr t        |       xs | j                   S N)sharding_strategyr   NO_SHARDr   _use_orig_params)r'   s    X/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/fsdp/_state_dict_utils.py_should_unshard_paramsr/   :   s:    $$(8(A(AA 	HJ'F:+F+F     module_namec                     | j                  t         d      } | j                  t         d      } | r|  d} | j                  t        j                  d      } | S )N .)replacer   r   checkpoint_wrapper_CHECKPOINT_PREFIX)r1   s    r.   _convert_to_wrapped_module_namer8   A   sZ    %%;K%%)<(=CK$Q'%%&8&K&KRPKr0   modulec              #      K   t        ||       sy t        ||       j                         D ]  \  }}t        |      }| | }|||f  y wr*   )r   r   param_module_namesr8   r9   r'   
param_namer1   fqns        r.   _param_name_infosr?   K   sc      J/#1F$+
K 6kBj\*:{**+s   A	Ac              #      K   t        ||       j                         D ]  \  }}t        |      }| | }|||f  y wr*   )r   shared_param_module_namesr8   r<   s        r.   _shared_param_name_infosrB   X   sV      $2F$!+
K 6kBj\*:{**+s   <>	writeback
rank0_onlyoffload_to_cpu
with_gradsc                     | |j                   vsJ d       t        | |||||      |j                   | <   |j                   |    j                          y)z
    state_dict hooks cannot use the pure context call as the checkpoint flow
    requires to enter the context in the pre-hook but leave the context in the
    post-hook. This API enters the context of ``_unshard_fsdp_state_params``.
    z`Entering the ``_unshard_fsdp_state_params`` context but _unshard_params_ctx[module] is not None.)rC   rD   rE   rF   N)_unshard_params_ctxr%   	__enter__)r9   r'   rC   rD   rE   rF   s         r.   _enter_unshard_params_ctxrJ   c   sg     777 	7 .H%.J""6* ""6*446r0   c                 z    |j                   |    j                  ddd       |j                   j                  |        y)zAA helper function to exit ``_unshard_fsdp_state_params`` context.N)rH   __exit__popr9   r'   s     r.   _exit_unshard_params_ctxrO      s5     ""6*33D$E""&&v.r0   c                     |j                   j                         r|j                   j                          t        ||        |j                  rt        |j                         yy)zAPerforms the pre-state_dict tasks shared by all state_dict types.N)_device_handleis_availablesynchronizer   _is_rootr   _all_handlesrN   s     r.   _common_pre_state_dict_hookrV      sN    
   --/!!--/z6"-j.E.EF r0   c                 <    t        |      syt        | |d||       y)z
    Performs the pre-state_dict tasks shared by all state_dict types that require
    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
    NF)rC   rE   rD   )r/   rJ   )r9   r'   rE   rD   s       r.   #_common_unshard_pre_state_dict_hookrX      s&     "*-%r0   
state_dictprefix
param_hookc                    t        ||t         z   |       |rt        ||       st        |      rt	        | |       |S |j
                  t        j                  k(  xr$ t        t        |j                        j                  }|xr |j                  dk7  }|r}|j                  sq|j                  D ];  }|j                  t         j"                   dd      }|j%                  | | d       = |j%                  | t&                t	        | |       |S t)        | |      D ]`  \  }}	}
| | }|r|j%                  |       !||v s2J d| d|j+                          d| d|
 d	|	 d
|j                   d        ||||       b t        |      rt	        | |       t-        j.                  d      }g }g }|j                  D ]  }t1        |      }| | }||vr|r|j%                  |       -||   }|j                  j2                  r#|j.                  |k7  r|j5                  |      ||<   ||j6                  vsz|j9                  |       |j9                  ||           |rt;        |      s|j=                         n|j>                  j@                  du}|rqtC        ||      }tE        |||jF                         tI        ||      D ]?  \  }}| | }tJ        jM                  d||jN                         |jQ                         ||<   A |S )z
    The post-state_dict flow that shared by all state_dict types that require
    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
    hook.
    r   r4   r3   NzFSDP assumes z2 is in the state_dict but the state_dict only has z	. prefix=z, module_name=z, param_name=z rank=cpuz%FSDP is casting the dtype of %s to %s))r   r   r   r/   rO   _state_dict_typer   FULL_STATE_DICTr   r   _state_dict_configrD   rankr-   _buffer_namesr5   r6   r7   rM   r&   r?   keystorchdevicer   rE   to_ignored_buffer_namesappendr   $_mixed_precision_enabled_for_buffersmixed_precisionbuffer_dtyper   r   compute_deviceziploggerinfodtypeclone)r9   r'   rY   rZ   r[   rD   no_fsdp_return	clean_keyr>   r=   r1   
cpu_devicebuffer_clean_fqnsbuffersbuffer#mixed_precision_enabled_for_buffersbuffer_dtypes	clean_fqns                     r.   $_common_unshard_post_state_dict_hookr{      s>    z6{m#<fE-j&A!*-$VZ8 	##}'D'DD 	P$j&C&CDOO   8JOOq$8Nj99#11 	9I!))%889;RI NNfXi[148	9 	&*./ 4 ):&*(M ,$ZNN3j  	
C5 !??$% &X^K= 9$VJOO+<A?	
  	:vs+, j) 4e$JG-- 0	%i0	$j NN3_F--<<MMZ/"())J"7
3
 @ @@!((3z#/'0*  "*- ;;=,,99E 	,
 /3J@QRM-
(A(A &)2C%D 1!	,CS&,,W"(,,.
31 r0   c           	          t        | dd      rt        j                  | j                         t	        ||        t        || | j                  j                  t        t        | j                        j                         y)aU  
    Hook that runs before model.state_dict() is called. pre-state_dict hook is
    not actually supported by ``nn.Module``. As a result, this API is called
    from ``_full_post_state_dict_hook()`` to simulate the case. Once pre-state_dict
    is supported in ``nn.Module``, this hook will be registered as a hook in
    ``nn.Module``.
    _device_meshFrE   rD   N)getattrr   get_root_meshr}   rV   rX   r`   rE   r   r   rD   r'   r9   argskwargss       r.   _full_pre_state_dict_hookr     sb     z>51%%j&=&=>
3'!44CC+Z-J-JKVV	r0   c                 p    dt         t        t        f   dt        dt        ddffd}t        | |||      S )a!  
    Hook that runs after model.state_dict() is called before returning result to
    user. For FSDP, we may have to clone the tensors in state_dict as params go
    back to sharded version after _unshard_fsdp_state_params ends, and also remove
    the ``FSDP_WRAPPED_MODULE`` prefix.
    rY   rZ   r>   r(   Nc                 V   |}t        |      }|j                  |      }t        | |   dd      s0	 | |   j                         j	                         | |<   d| |   _        y y # t        $ r>}t        j                  d| dj                   d| dt        |              Y d }~y d }~ww xY w)N_has_been_clonedFTz#Failed to clone() tensor with name z	 on rank z. This may mean that this state_dict entry could point to invalid memory regions after returning from state_dict() call if this parameter is managed by FSDP. Please check clone implementation of z	. Error: )r   removeprefixr   detachrq   r   BaseExceptionwarningswarnra   str)rY   rZ   r>   rs   clean_prefixer'   s         r.   r[   z._full_post_state_dict_hook.<locals>.param_hook<  s    
 	(0 **<8	 z#(:EB
",S/"8"8":"@"@"B
337
30 C ! 9#i
GX Y) *-Ys1vh	@ s   .A! !	B(*4B##B(dictr   r   r{   r9   r'   rY   rZ   r[   s    `   r.   _full_post_state_dict_hookr   .  sN    cN  
	4 0
J
 r0   c                     t        ||        t        |      r,t        j                  d      5  t	        | |d       d d d        t        |      st        |||t         z          y y # 1 sw Y   +xY w)NrJ   TrC   )r   r/   r   profilerJ   r   r   r   )r9   r'   rY   rZ   s       r.   _full_pre_load_state_dict_hookr   [  sl     z6"j)##$?@ 	J%fjDI	J *%:vv;-/HI &	J 	Js   A&&A/c                     t        |      r+t        j                  d      5  t        | |       d d d        y y # 1 sw Y   y xY wNrO   )r/   r   r   rO   r9   r'   r   r   s       r.   _full_post_load_state_dict_hookr   j  sD     j)##$>? 	9$VZ8	9 	9 *	9 	9s	   8Ac                 v    t        | |      r!t        | |      j                  st        d      t	        ||        y)z
    Hook that runs before model.state_dict() is called. Right now, pre-state_dict
    hook is not supported by the PyTorch core. So this API is called from
    `_local_post_state_dict_hook()` to simulate the case.
    zN``local_state_dict`` can only be used when parameters are flatten and sharded.N)r   r   uses_sharded_strategyRuntimeErrorrV   r   s       r.   _local_pre_state_dict_hookr   r  s<     	V,z62HH
 	
  
3r0   c                 Z   t        || t         |       t        ||       s|S t        ||       sJ d       t        ||       j                  }|j
                  j                         }|j                         |j                  z  }|j                         |j                  z
  }|dkD  r8|d| j                  |      }t        j                  ||g|j                        g}ng }t        |||j                        }	|j                  j                  r|	j!                         }	|	|| t"         <   |S )z
    This hook create a ShardedTensor from the local flat_param and replace
    the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
    will happen. The underlying storage is the same.
    zShould have returned earlyr   N)process_group)r   r   r   r   
flat_param_unpadded_unsharded_sizenumelra   _shard_numel_paddedviewr
   from_tensor_and_offsetsr	   r   r`   rE   r]   r&   )
r9   r'   rY   rZ   r   
full_numelshard_offsetvalid_data_sizelocal_shardssharded_tensors
             r.   _local_post_state_dict_hookr     s0    zfXk]#;VDJ/ *f-K/KK-
F3>>J 44::<J##%
7L &&(:+I+IIO
   0166G
))*|njooV
 +j
0H0HN $$33'++-*8J&*&'r0   c                      y r*    r   s       r.    _local_post_load_state_dict_hookr     s     	r0   c                    t        ||        t        ||| t                | t         t         }||vrt	        ||       rJ d       y||   }t        |t              sJ d       t        ||       j                  }|J |j                         |j                  z
  }|j                         }|dkD  rt        |      sJ d       |d   j                  }|j                  dkD  rp|j                         |j                         k  s*J d|j                          d|j                          d       t        j                  |d|j                  g      }n|}|||<   y)	z
    This hook finds the local flat_param for this FSDP module from the
    state_dict. The flat_param should be a ShardedTensor. This hook converts
    the ShardedTensor to a tensor. No copy happen unless padding is required.
    zONo `FlatParameter` in `state_dict` for this FSDP instance but it has parametersNz4Tensors in local_state_dict should be ShardedTensor.r   z9load_local_state_dict assume one shard per ShardedTensor.zLocal shard size = z% and the tensor in the state_dict is r4   )r   r   r   r&   r   
isinstancer   r   r   r   r   r   lentensorFpad)	r9   r'   rY   rZ   r>   load_tensorr   r   shardss	            r.   _local_pre_load_state_dict_hookr     s    z6"z6fXk]+CDH[M*
.C
*#J7 	
$	
7 	S/Kk=1 >1
  
F3>>J!!! &&(:+I+IIO%%'F6{WWW{Qi&& ))A-$$&)9)9);; %j&6&6&8%9 :%%0%6%6%8$9<; %%a1O1O-PQK !JsOr0   c                     t        | |      r!t        | |      j                  st        d      t	        ||        t        || dd       y)zz
    Hook that runs before model.state_dict() is called. Check
    ``_full_pre_load_state_dict_hook`` for the detail.
    zP``sharded_state_dict`` can only be used when parameters are flatten and sharded.Fr~   N)r   r   r   r   rV   rX   r   s       r.   _sharded_pre_state_dict_hookr     sR     	V,z62HH
 	
  
3 (	r0   c                 l    dt         t        t        f   dt        dt        ffd}t        | |||      S )z
    The hook replaces the unflattened, unsharded parameter in the state_dict
    with a unflattened, sharded parameter (a ShardedTensor).
    rY   rZ   r>   c                    | |   }j                   j                  sRt        |j                  j                  j
                  j                         j                  j                        }n-t        |j                  j                  j                        }j                   j                  r|j                         }|| |<   y )N)r   ra   
world_sizenum_devices_per_nodepgfsdp_extension)r   ra   device_meshr   )r`   _use_dtensorr"   ra   r   rQ   device_countr   _fsdp_extensionr!   r}   rE   r]   )rY   rZ   r>   paramr   r'   s        r.   r[   z1_sharded_post_state_dict_hook.<locals>.param_hook  s    3,,99.__%00%/%>%>%K%K%M++)99N 0__&33)99	N ((77+//1N(
3r0   r   r   s    `   r.   _sharded_post_state_dict_hookr     s?    )tCH~ )s ) ), 0
J
 r0   c                     t        ||       r+t        j                  d      5  t        | |       d d d        y y # 1 sw Y   y xY wr   )r   r   r   rO   r   s       r.   "_sharded_post_load_state_dict_hookr   4  sF     
F+##$>? 	9$VZ8	9 	9 ,	9 	9s	   9Ac                    t        ||        t        |      st        |||t         z          t	        ||       syt        ||       }|j                  st        d      t        t        |j                  j                  |j                  j                              }t        | |      D ]@  \  }}}t        |      s| t         | }n| | }	 |j                  |      }	|j$                  j&                  sBt)        |	|j*                        \  }	}
t-        |
      dk  s!J dt-        |
       d|j.                   d       |	j1                         j3                         }|	j1                         d	   }t5        j6                  ||j8                  z        |z  |z  }t-        |
      d
k(  r|
d	   j:                  j=                         }t?        j@                  t>        jB                  jD                        5  |jG                  |jH                        }ddd       ||j3                         z
  }|d	kD  rEtK        jL                  |d	|g      }n,tO        jP                  ||	jR                  |jH                        }tO        jT                  ||j8                  z  |jR                  |jH                        }t?        j@                  t>        jB                  jV                        5  tY        jZ                  |||j\                         ddd       |j_                  d	d	|      ja                  |	j1                               }|||<   |	jb                  |jd                  jf                  k7  r%|	jG                  |jd                  jf                        }	ti        jj                  |jd                        }tm        |	||j*                        }|jo                  |      ||   }tq        |||j*                        }|||<   C t?        j@                  d      5  ts        | |d       ddd       y# t        $ r t         j#                  d| d       Y w xY w# 1 sw Y   8xY w# 1 sw Y   WxY w# 1 sw Y   yxY w)z
    The hook combines the unflattened, sharded parameters (ShardedTensor) to
    a new FlatParameter and shards the new FlatParameter to the local chunk.
    NzUload_sharded_state_dict can only be called when parameters are flattened and sharded.zDid not find param with FQN zD, skipping it. The weight will not be filled if you expect it to be.   z&Expects 0 or 1 shard per rank but got z shards on rank r4   r   r   )rp   re   )grouprJ   Tr   ):r   r   r   r   r   r   r   r   r   rm   r   _fqns_param_extensionsr?   rM   KeyErrorrn   warningr`   r   r$   r   r   ra   sizer   mathceilr   r   flattenr   r   TypeH2Drf   rl   r   r   rd   zerosrp   empty	ALLGATHERdistall_gather_into_tensorr   narrowreshapere   r}   device_typer   r   r    getr#   rJ   )r9   r'   rY   rZ   handlefqn_to_param_extr>   _fqn_from_global_rootr   r   param_numel
dim_0_size
chunk_sizelocal_tensornum_paddingr   	root_meshexts                      r.   !_sharded_pre_load_state_dict_hookr   =  s    z6"*%:vv;-/HIJ/J/F'')
 	
 F##V%6%6%H%HI 'vz: D<	Qj)&,Xk]3%#@ &,XcU#3 	NN#78E ,,99>z11ME6 v;? v;-'7
7HK?  **,,,.KaJ		*z'<'<<= 
 6{a%ay//779#++N,?,?,C,CD N#/??:3L3L#MLN(<+=+=+???#$55;7G#HL${{ekk*:S:S  [[Z222"((!00F
  ''(;(;(E(EF ++L
0H0H ]]1a5==ejjlKF/5J+,||z66BBB!8!8!D!DE'55j6M6MNI2y*"<"<L  ##C(4&s+< #z'A'A  0<J+,ID<L 
		 ;	< F!&*EF F  	NN./C.D EH H 	6N N ,F Fs6   P.Q #Q?Q"P=<P= Q
	Q	Q#c              #      K   | j                   }| j                  }t               | _         t        j                  | _        d  || _         || _        y wr*   )r`   r^   r   r   r_   )r'   old_state_dict_configold_state_dict_types      r.   "_replace_with_full_state_dict_typer     sJ     &99$55$7$9J!"/"?"?J	$9J!"5Js   AAr   c           
      B   t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        j                         }|5  t        j                  t        t        j                  t        t        j                  t        i} ||j                      | |||      }ddd       |j"                  rFt$        j'                  d|       t)        j+                               D ]  \  }}	|j-                  |      st/        |	t0        j2                        s4|	j4                  }
d}t/        |	t6              rGd}
|	j9                         }|rv|d   j:                  j4                  }
|d   j:                  j<                  }nCt/        |	t>              r'|	jA                         j4                  }
|	j<                  }n|	j<                  }t$        j'                  d|tC        |	      |	j4                  |
|	jD                  |        S # 1 sw Y   _xY w)z
    _post_state_dict_hook() is called after the state_dict() of this
    FSDP module is executed. ``fsdp_state._state_dict_type`` is used to decide
    what postprocessing will be done.
    RWhen using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.Nz0FSDP finished processing state_dict(), prefix=%sr   z>FQN=%s: type=%s, shape=%s, local_shape=%s, dtype=%s, device=%s)#r   r+   r   r,   r   r   r   
contextlibnullcontextr   r_   r   LOCAL_STATE_DICTr   SHARDED_STATE_DICTr   r^   rT   rn   ro   sorteditems
startswithr   rd   Tensorshaper   r   r   re   r   to_localtyperp   )r9   rY   rZ   r   r'   context_post_state_dict_hook_fnprocessed_state_dictkeyr   local_shapere   r   s                r.   _post_state_dict_hookr    s    @GJ##'7'@'@@4Z@	

 ((*	 
))+E**,G,,.K$
 
  U7
8S8STJ
F 

 FO!"6"<"<">? 	KC~~f%*VU\\*J$llfm4"&K#002F&,Qi&6&6&<&<!'!1!1!8!80"(//"3"9"9K#]]F#]]FTLLLLL	2  K
 
s    AHHc                    t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        |       t        j                         }|5  t        j                  t        t        j                  t        t        j                  t         i} ||j"                     || g|i | ddd       y# 1 sw Y   yxY w)z
    This is called before the core state dict saving logic of ``module``.
    ``fsdp_state._state_dict_type`` is used to decide what postprocessing will
    be done.
    r   N)r   r+   r   r,   r   r   r   _set_use_dtensorr   r   r   r_   r   r   r   r   r   r^   )r9   r   r   r'   r   _pre_state_dict_hook_fns         r.   _pre_state_dict_hookr    s     @GJ##'7'@'@@4Z@	

 	$((*	 
))+D**,F,,.J#

 	=
 ; ;<	
 	
 		

 
 
s   +ACCc                     t        | dd       r>| j                  }|t        j                  k(  rt	        ddd      d| j
                  _        y y )Nr}   z&Found state_dict_type LOCAL_STATE_DICTz3DeviceMesh is not compatible with LOCAL_STATE_DICT.zKPlease set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.T)r   r^   r   r   r   r`   r   )r'   state_dict_types     r.   r  r    sT     z>40$55m<<<8E]  :>J))6 1r0   c                 n   t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        |       t        j                         }t        ||        |j                  rt        j                          |5  t        j                  t         t        j"                  t$        t        j&                  t(        i}|j*                  j-                         r|j*                  j/                           ||j0                     | |||       ddd       y# 1 sw Y   yxY w)z
    This is called before ``module._load_from_state_dict()``.
    ``fsdp_state._state_dict_type`` is used to decide what preprocessing will
    be done.
    r   N)r   r+   r   r,   r   r   r   r  r   r   r   rT   r   resetr   r_   r   r   r   r   r   rQ   rR   rS   r^   )r9   rY   rZ   r   r'   r   _pre_load_state_dict_hook_fns          r.   _pre_load_state_dict_hookr  !  s     @GJ##'7'@'@@4Z@	

 	$((*z6"	 
))+I**,K,,.O(
$ $$113%%113A$Z%@%@AJ
F	

 
 
s   BD++D4incompatible_keysc                    t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        j                         }|5  t        j                  t        t        j                  t        t        j                  t        i} ||j                      | |       d d d        |d   }|d   }t#        t%        |            D ]  }t'        ||         ||<    t#        t%        |            D ]  }t'        ||         ||<    |j(                  rt+        j,                  d       y y # 1 sw Y   xY w)Nr   r   r   z&FSDP model load_state_dict profiling: )r   r+   r   r,   r   r   r   r   r   r   r_   r   r   r   r   r   r^   ranger   r   rT   r   dump_and_reset)	r9   r  r   r'   r   _post_load_state_dict_hook_fnmissing_keysunexpected_keysis	            r.   _post_load_state_dict_hookr  L  sC    @GJ##'7'@'@@4Z@	

 ((*	 	W))+J**,L,,.P)
% 	C%j&A&AB6:V	W %Q'L'*O3|$% =+LO<Q= 3'( C.q/ABC %%&NO )	W 	Ws    AD>>Estatec                 z    dt         i fdt        i fdt        ddifdt        i ffD ]  \  }}}t	        | |||        y)zR
    Registers pre-save, post-save, pre-load, and post-load state dict hooks.
    register_state_dict_pre_hook_register_state_dict_hook"_register_load_state_dict_pre_hookwith_moduleT"register_load_state_dict_post_hookN)r  r  r  r  _register_state_dict_hooks_base)r  hook_registration_fn_strhookhook_registration_fn_kwargss       r.   _register_all_state_dict_hooksr!  u  sj    
 
()=rB	$&;R@0%D!	

 
./I2N	H 
C $(C 	(+T3N	

r0   hook_registration_fn_namer  r   c                     t        |       s t        | |      |fi | y| j                  }|r t        |j                  |      |fi | yy)z2Registers ``hook`` using ``hook_registration_fn``.N)r   r   _handle_fully_sharded_module)r  r"  r  r   r   s        r.   r  r    sX     % 101$V:UVLGF002KL3 r0   )FFFF)hr   loggingr   r   collections.abcr   r   typingr   r   r   r   rd   torch.distributeddistributedr   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapper
algorithms_checkpointr6   torch.nnnntorch.nn.functional
functionalr   'torch.distributed._shard.sharded_tensorr	   r
   r   torch.distributed.device_meshr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   r   #torch.distributed.fsdp._debug_utilsr   %torch.distributed.fsdp._runtime_utilsr   r   r   r   torch.distributed.fsdp.apir   r   r   torch.distributed.tensorr   torch.distributed.utilsr   _fsdp_extensionsr    r!   r"   r#   r$   _unshard_param_utilsr%   r&   	getLogger__name__rn   boolr/   r   r8   Moduletupler?   rB   rJ   rO   rV   rX   r   r{   r   r   r   r   r   r   r   r   r   r   r   r   contextmanagerr   no_gradr  r  r  r  listr  r!  r  r   r0   r.   <module>rD     sE       / 5 5    X X    
 :	 	 	 ?  
 - 6  I 
		8	$z d   
+II
+#-
+eCcM"#
++II+eCcM"#+   7II77 7 	7
 7 7 
7 78 /RYY /J /4 / /
GII
G
G 

GII  	
 
, fIIff S#Xf 	f
 f 
#s(^f fR II
 
 2 )II)) S#X) 	)
 
#s(^) )XJIIJJ S#XJ 	J
 
J9II9#-9	944II4
 
4, .II.. S#X. 	.
 
#s(^. .b	II	#-			-"II-"-" S#X-" 	-"
 
-"`II
 
: #II## S#X# 	#
 
#s(^# #L 9II9#-9	9 9 aFIIaFaF S#XaF 	aF
 
aF aFH 6: 6) 6 6 : II: S#X:  :  	: 
 
#s(^:   : z  
II 
 
	 
   
F > > > > &
II&
S#X&
 &
 	&

 
&
  &
R $PII$PT#YS	12$P $P 
	$P  $PN
* 
& "  "&c3h	
 
 r0   