
    Vh                     B   d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlmZ d dlmc mc mZ d dlmc mc mZ d dlmc mc mZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2 d d	l3m4Z4 d d
l5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE erd dlFmGZG dZH	 d dlImJZJmKZK  eMd      ZNdZOePej                  ej                  f   ZReeej                  eRf      ZSe;j                  e2j                  e;j                  e2j                  e;j                  e2j                  e;j                  e2j                  e;j                  e2j                  iZYe;j                  e;j                  gZZe;j                  e;j                  fZ[e	 d_de(deSde;dee?   dee#   de(fd       Z\ede(deSde#de(fd       Z]ede	de^fd       Z_ede#de^fd       Z`edeMdej                  fd        Zaed!ej                  deMdej                  fd"       Zbd!ej                  deMdePej                  ej                  f   fd#Zce	 d_de(d$ej                  d%eeej<                  j                        d&eeeej<                  j                        eeej<                  j                        f   de(f
d'       Zfd&ege	   d(e^ddfd)Zhede(d$ej                  d*eiej                     d+eeeMej                  f      de(f
d,       Zkede(d$ej                  de(fd-       Zlede(dee;   d.ee:   d/ee7   d0e^d1e^d2eMd3eMde(fd4       Zmede(de(fd5       Znede(d6e6d7e^de(fd8       Zoed_de(de#de(fd9       Zpede(de(fd:       Zqd$ej                  d;egej                     ddfd<Zrede(d=ej                  d+eeeMej                  f      d>ee
ej                  gdf      d?e^de(fd@       Zsede(d;egej                     d=ej                  fdA       ZtdBej                  dCeeej<                  j                        deiej                     fdDZu	 d_dBej<                  j                  d%eiej<                  j                     dEeeej<                  j                        deiej<                  j                     fdFZvdBej<                  j                  d%eiej<                  j                     deiew   fdGZxdBej                  deiew   fdHZyd$ej                  d*eiej                     d+eeeMej                  f      ddfdIZzd+eeeMej                  f      dJeMdKe'deej                     fdLZ{d$ej                  d*eiej                     d%eiej                     dePe^e^f   fdMZ|dBej                  d>e
ej                  gdf   d%eiej                     ddfdNZ}dBej                  dOeej                     d%eiej                     dKe'fdPZ~dBej                  d%eiej                     degej                     fdQZd$ej                  d*eiej                     dReiej                      dOeej                     ddf
dSZd;egej                     dTegej                      dOeej                     ddfdUZdV Zd$ej                  d*eiej                     dOeej                     dJeMdKe'dej                  fdWZd$ej                  d;egej                     dej                  ddfdXZdYegej                      ddfdZZd$ej                  d*eiej                     deej                     fd[Zd*eiej                     ddfd\Zde;fd]Zdej                  de j                  fd^Zy# eL$ r dZHY w xY w)`    N)	GeneratorIterableIterator)AnyCallableno_type_checkOptionalTYPE_CHECKINGUnion)default_hooks)_mesh_resources
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 (   ||t        d      |t        v }|r#|||t        d| d      t        | ||      } n4|r|| _        |j	                  d      | _        n||n	t               | _        | j
                  j                         | _        | j
                  j                         | _	        | j                  }|r|| j                  j                         z  }t        j                  j                  |      | _        || j                  z  | _        | S )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr-   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r,   r-   r.   r/   r0   is_hybrid_strategydata_parallel_world_sizes          R/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterE   Y   sI     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?}kE !,E"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// E$8$8$=$=$?? ""AA$	
 
$ 	!5#C#CC 
% L    c                    |rYt        |      r6|| _        |j                  d      | _        |j                  d      | _        nt        d|j                         |@t               }t        || j                  j                               \  }}|| _        || _        n2t        |      r|\  | _        | _        nt        dt        |             t        | j                        | _        | S )Nr   r3      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r-   )"_is_valid_hybrid_shard_device_meshr8   r9   r=   r-   r5   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r,   r-   r0   default_groupintra_node_groupinter_node_groups         rD   r7   r7      s
    -k:!,E $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N5//<<>.
** // *-8 9F5E!5GGKMGZF[] 
 ;**E LrF   c                 j    t        | t              xr" t        |       dk(  xr t        d | D              S )N   c              3   P   K   | ]  }t        |t        j                           y wN)
isinstancedistProcessGroup).0pgs     rD   	<genexpr>z1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s     Jb
2t001J   $&)rZ   tuplelenallrI   s    rD   rO   rO      s:     	=%( 	K!#	KJMJJrF   c                 D    t        | t              xr | j                  dk(  S )NrW   )rZ   r   rK   )r0   s    rD   rJ   rJ      s    k:.H;3C3Cq3HHrF   num_devices_per_nodec                 6    t        j                  |       \  }}|S )aU  
    Return a process group across the current node.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return an intra-node subgroup across
    [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
    For example, rank 3 would get [0, 1, ..., 7].
    )r[   new_subgroups)re   intra_node_subgroup_s      rD   _init_intra_node_process_grouprj      s!     "//0DErF   global_process_groupc                 T   d}t        j                  |       }t        j                  |       }||z  }t        j                  |       |z  }t	        |      D ]?  }t	        |      D cg c]
  }|||z  z    }	}t        j
                  |	|      }
||k(  s>|
}A |
J | d       |S c c}w )a  
    Return an inter-node process group where each contained rank has the same local rank.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
    depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
    would get [5, 13].
    N)ranksbackendz. expected to assign inter-node pg, but did not)r[   get_backendget_world_sizeget_rankrange	new_group)rk   re   inter_node_pgsharding_backendr<   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rD   _init_inter_node_process_groupr|      s      M''(<=$$%9:J22IMM"67:NNM01  
=B9=M!
89J!223!
 !
 nn#8BRS&M  $ /GH$ !
s   %B%c                 0    t        |      t        | |      fS )a  
    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
    ``_HYBRID_SHARD_ZERO2`` in FSDP.
    This function assumes each node has an equal number of CUDA-enabled devices.
    Returns:
        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
    )rj   r|   )rk   re   s     rD   rL   rL      s#     	'';<&';=QR rF   moduleignored_modulesignored_statesc                    ||t        d      d }|d u}|rt        |      }t        |d       ng }t        |t        |      ng d       t        |      dkD  r"t	        |d   t
        j                        r|}n|}t        ||      | _        t        || j                  |      | _
        t        || j                        | _        | S )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r5   list_check_ignored_statesrb   rZ   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r,   r~   r   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rD   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;!41O1&/JE/E
 #<#E LrF   r   c                    t        |       dk(  ry|r`t        d | D              }t        d | D              }|s9|s6t        | D ch c]  }t        |       c}t              }t        d|       yyt        d | D              s6t        | D ch c]  }t        |       c}t              }t        d|       yc c}w c c}w )	z
    Check that the ignored states are uniformly parameters or uniformly modules.

    We may remove this check in the future if we permit mixing.
    r   Nc              3   P   K   | ]  }t        |t        j                           y wrY   )rZ   r   r   r]   r,   s     rD   r_   z(_check_ignored_states.<locals>.<genexpr>I  s     UUE2<<8Ur`   c              3   P   K   | ]  }t        |t        j                           y wrY   rZ   r   Moduler   s     rD   r_   z(_check_ignored_states.<locals>.<genexpr>J  s     S5*UBII6Sr`   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   P   K   | ]  }t        |t        j                           y wrY   r   r   s     rD   r_   z(_check_ignored_states.<locals>.<genexpr>S  s     LE:eRYY/Lr`   z>ignored_modules expects nn.Module list elements but got types )rb   rc   sortedrP   reprr5   )r   r   
all_paramsall_modulesr,   sorted_typess         rD   r   r   >  s     >aUnUU
SNSS+!N"K54;"KQUVL**69  #.z L^LL!N"K54;"KQUVL%(  M #L #Ls   B;C ignored_params	device_idc                 6   d}|1t        |t        j                        r|nt        j                  |      }|t        ||      D ]|  }|j                  j                  dv r||j                  }+|j                  j                  |j                  k7  sOt        d|j                   d|j                  j                          |xs t        j                  j                         }|j                  dk(  rt        d      t        j                  |      | _
        | S )a<  
    Determine device handle used for initializing FSDP.

    If a device is specified by ``device_id``,
    then returns device handle corresponds to that device type. Otherwise, If the
    module is already on a non-CPU device, then the device type is that non-CPU device type.
    If the module is on CPU or meta, then the device type is the current accelerator device.
    See the :ref:`Accelerators<accelerators>` for details.


    This method will be called once ignored paramters was determined, as the device handle maybe needed
    for other initialization.
    N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)rZ   torchdevice_get_orig_paramsrP   RuntimeError_C_get_acceleratorr   from_devicerM   )r,   r~   r   r   determined_deviceparams         rD   _init_device_handler   [  s   (  )U\\2 i( 	
  %fn= 
	E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^ 
	 .L1J1J1L!!U*a  -889JKELrF   c                     t        |      | _        i }|j                         D ]  \  }}t        |      }|j                  ||<   ! || _        | S rY   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r,   r~   r   buffer_namebuffers        rD   _init_buffer_stater     s_    
 ,F3E
 :<%335 ?V'428,,";/? (BE$LrF   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    | j                   dk(  rO|t        j                  k7  r+t        j                  d|xs t        j
                   d       t        j                  }n/|t        j                  k(  rt        j                  dt        d       |xs t        j
                  | _        |xs
 t               | _	        |5t        j                  j                  dt        | j                                t        j                  j!                  t"        d      d	k(  | _        |xs
 t'               | _        || _        || _        t.        j0                  | _        d | _        t7               | _        t;        j<                         | _        tA        jB                  | j>                  ||      | _"        d | _#        i }|| _$        d }	|	| _%        g }
|
| _&        | S )
NrH   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.zoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   )
stacklevelz'torch.distributed.fsdp.mixed_precision. 1)'r<   r!   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr.   r    r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr   _free_event_queuer[   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r,   r.   r   r   r   r   r   r   r   r   r   s              rD   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E''
 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/E,E(--EEN-/E--/E-<<E
  E IK#,KE) *.GEM"$FELLrF   c                 f    g }|| _         g }|| _        g }|| _        d| _        d | _        d | _        | S )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r,   r   r   r   s       rD   _init_runtime_stater     sK     8:&?E#24!5E35"7E EE!ELrF   backward_prefetchforward_prefetchc                 "    || _         || _        | S rY   )r   r   )r,   r   r   s      rD   _init_prefetching_stater     s     0E-E LrF   c                     t        j                  |      }|r+|| j                  k7  rt        | j                        | _        | S d | _        | S rY   )r   get_root_meshr8   r%   rM   _fsdp_extension)r,   r0   	root_meshs      rD   _init_extensionr     sO      --k:I yE$6$66 1%2F2F G
 L !%LrF   c                     t         j                  | _        t               }t	               | _        || _        i }|| _        | S rY   )r#   FULL_STATE_DICT_state_dict_typer   r   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r,   state_dict_configunshard_params_ctxs      rD   _init_state_dict_stater     s?    *::E)<)>%=%?E"0E57 2ELrF   r   c                     |D ]O  }t        |j                        dk(  sd}| j                         D ]  \  }}||u s|} n |sJ t        d| d       y)z
    Verify if the parameters are accepted by FSDP. The only restriction now
    is that the parameter cannot be a scalar tensor (param.shape == []).
    r   r   z/FSDP doesn't support scalar parameters. Change z& to a 1D tensor with numel equal to 1.N)rb   shapenamed_parametersr5   )r~   r   r   
param_namenameparam_s         rD   _verify_managed_paramsr     s~    
  u{{q J & 7 7 9 fF?!%J :$%KM rF   fully_sharded_moduleparam_init_fnsync_module_statesc                 r    t        | j                  |       t        | j                   j                        }t        | j                   j                        \  }}|s|r|t        || j                         nA|r#t        || j                   j                         n|rt        j                  | fd        j                  D 	ch c]  }|j                         D ]  }	|	  }
}}	t        | j                  |
|       t        | j                  | j                   j                         _        t        t!        | j                              }t#        ||       |r@t%        || j&                          j(                  t*        v rt%        || j,                         t/         ||        S c c}	}w )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 >    t        |       d u xr | j                  vS rY   )r   r   )	submoduler,   s    rD   <lambda>z0_init_param_handle_from_module.<locals>.<lambda>L  s(    '=i'HD'P (8!7!77 rF   )check_fn)_check_single_device_moduler   _get_device_from_device_idr:   rM   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler)   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   _sync_module_params_and_buffersr-   r.   r6   r=   _init_param_handle_from_params)r,   r   r   r   r   device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rD   _init_param_handle_from_moduler  -  s      4e6K6KYW65::u33 3Ne33U5K5K3/N/ 	5=;T' -1G1G	
 
  ""  		
 
%(( 8	
 $44$,,.  	O  	 /

E *+?AVAVWXN/@' .%2E2E	
 ""&@@+$ne6J6J #5.:NOL?s   F3c                    t        |      dk(  ry t        ||| j                  t        | j                     | j
                  j                  | j                  j                  | j                  j                  | j                  j                  | j                  | j                  | j                        }|j                          | j                  rJ | j                   j#                  |j$                         || _        || j&                  |j(                  <   t+        j,                  d      }| j
                  j                  r,|j$                  j,                  |k7  r|j/                  |       y y y )Nr   )fsdp_extensionr   )rb   r   r  SHARDING_STRATEGY_MAPr.   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr-   r   r   shardr   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r,   r   r   handle
cpu_devices        rD   r  r  r  s%    6{ae556(())**66,,F LLN}}	LL))*EMJPE))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rF   root_moduler   c           	         d}	 |t        |      n	t               }|D ]V  }t        |t        j
                  j                        st        |dt        |       z         t        |      sMt        d       | j                         D ])  }t        j                  |      r|j                  |       + |D ch c]3  }|j                         D ]  }t        |t        j                        s|  5 }}}| |v rt        j                   d        | j                         D ]9  }t        |      }	|	t#        |	d      sJ |j%                  |	j&                         ; |S # t        $ r }t        |dt        |       z         |d}~ww xY wc c}}w )ah  
    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

    Return the modules contained in their module
    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
    already-computed ignored modules are included.

    ``_ignored_modules`` represents the argument passed by the user to FSDP.
    z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: r   )set	TypeErrorrP   rZ   r   r   r   r   r5   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   hasattrupdater   )
r$  r   
msg_prefixignored_root_moduleser~   childr   r   optional_fsdp_states
             rD   r   r     s    RJQ%5%AC !su 	
 ' R&%((//2J+DT&\N)SSTT!&) PQQR %%' -**62 $$V,- +^^% %!C!CD 	O  o%228;	
 !((* I	4Y?*.0BCCC""#6#G#GH	I
 I  Q
x5E0F/G%HHIqPQ$s   E 78F	F%F  Fr   c                    t               }|D ch c]%  }|j                         D ]  }t        |      r| ' }}}|j                  |       |,|D ch c]  }t        |      r| }}|j                  |       | j	                         D ]9  }t        |      }	|	t        |	d      sJ |j                  |	j                         ; |S c c}}w c c}w )z
    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

    :class:`FlatParameter` s are excluded from the result.
    r   )r&  
parametersr   r/  r(  r   r.  r   )
r$  r   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr   r4  s
             rD   r   r     s     36% #!ALLN!'(BTUVBW!	! ! 78%)(
1CA1FA(
$ (
 	!!">? !((* K	4Y?*.0ABBB%%&9&I&IJ	K '!(
s   #C
C
C%Cc           	         t               }|D ch c]  }|j                         D ]  }|  }}}|j                  | j                         D ch c]  \  }}||v rt	        |       c}}       | j                         D ]9  }t        |      }|t        |d      sJ |j                  |j                         ; |S c c}}w c c}}w )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   )	r&  r	  r/  r   r   r(  r   r.  r   )	r$  r   all_ignored_buffer_namesr8  r   buffers_in_ignored_modulesr   r   r4  s	            rD   r   r     s    
 *- ("aiik",2"" " ## (3'@'@'B	
#V33 k*	
 !((* W	4Y?*.0GHHH$++,?,U,UV	W $#'"
	
s   B>C
c                 f    | j                         D ch c]  \  }}t        |       c}}S c c}}w )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r$  r   ri   s      rD   r   r   	  s5     >I=V=V=X+9;+&  s   -c                     t        | |      D ch c]  }|j                   }}t        |      dk(  r%t        j                  d      |v r|t	        d      yt        |      dkD  rt	        d|       yc c}w )z
    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

    Thus, after this method, the
    module must be either fully on the CPU or fully on a non-CPU device.
    rW   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rH   z;FSDP only supports single device modules but got params on )r   r   rb   r   r   )r~   r   r   r   devicess        rD   r  r    s     *:&.)QRu||RGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A4r:   device_handlec                 ^   | yt        | t        j                        r| nt        j                  |       }|j                  dk7  ri|j                  ]t        j                  d|  d| d|j                          d|j                   d	       t        j                  |j                               }|S )z
    Return a ``torch.device`` for the specified ``device_id``.

    Processes ``device_id`` and returns either the corresponding device or
    ``None`` if ``device_id`` is ``None``.
    Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.)rZ   r   r   rP   indexr   r   current_device)r   r:   rB  r   s       rD   r  r  .  s     	5<<8	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11	
 m::<=MrF   c                    t        t        | |            }t        d |D              }| j                         D ]-  }||v r|j	                  d      D ]  }||j
                  z  } / | xr t        xr t        d |D              }||fS )z
    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

    At most of the returned bools can
    be ``True``. If either is ``True``, then ``module`` needs to be
    materialized.
    c              3   4   K   | ]  }|j                     y wrY   )is_metar]   r   s     rD   r_   z._need_to_materialize_module.<locals>.<genexpr>X  s     C5Cs   Frecursec              3   F   K   | ]  }t        j                  |        y wrY   )r*   is_fakerI  s     rD   r_   z._need_to_materialize_module.<locals>.<genexpr>d  s     @U#@s   !)r   r   anyr(  r	  rH  _TORCHDISTX_AVAIL)r~   r   r   r  r  r   bufr  s           rD   r  r  K  s     *6>BCNCNCCN ^^% *	'$$U$3 	*Cckk)N	**  	A	A@@@  
 666rF   c                     t        |      st        d| dt        |             t        | |      }|D ]
  } ||        y )Nz	Expected z to be callable but got )callabler5   rP   _get_modules_to_materialize)r$  r   r   modules_to_materializer~   s        rD   r  r  i  sV    
 M"&>tM?R>ST
 	
 9oV( frF   r  c           	      :   |xs# t        j                  |j                               }t        | |      }d }	 t        j                         5  |D ]u  }t        j                  |j                  d      |j                  d            }t        t        |            dkD  }|sS|j                  |d       |j                          w 	 d d d        y # 1 sw Y   y xY w# t        $ r5}	t        j                  dt!        |	       dt#        |       d       |	d }	~	ww xY w)NFrJ  r   )r   rK  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.)r   r   rE  rS  no_grad	itertoolschainr6  r	  rb   r   to_emptyreset_parametersBaseExceptionr   r   r   rP   )
r$  r  r   rB  materialization_devicerT  r~   module_state_iterhas_module_statesr2  s
             rD   r  r  w  s    3 ell$$&7 9oVF ]]_ 
	.0 	. %.OO%%e%4fnnUn6S%! %(->(?$@1$D!$OO+A5OQ++-	.
	. 
	. 
	.  !!$Q )L>!KM	

 s<   C AC!%CC CC C 	D%0DDc                 "   g }t        j                  | g      }| h}|rq|j                         }|j                  |       |j	                         D ]:  }||vst        |      ||vs|j                  |       |j                  |       < |rq|S rY   )collectionsdequepopleftr  childrenr   r+  )r$  r   rT  queuevisited_modulesr~   child_modules          rD   rS  rS    s    
 /1{m,E'2mO
%%f-"OO- 	+LO3*<8@ 7##L1\*	+  "!rF   r  c                    t        j                  d      |	t        j                         }|j	                  |        g }g }|r|j                         }|j                  fd|j                  d      D               |j                  fd|j                  d      D               |j                         D ].  }t        |t        j                        r|j	                  |       0 |r|D 	cg c]	  }	|	|vs|	 }
}	|D 	cg c]	  }	|	|vs|	 }}	t        |
||       yt        t        | |      d      }||j                  k(  rt!                yyyc c}	w c c}	w )a  
    Move ``module`` depending on ``device_from_device_id`` and its current device.

    This includes moving ignored modules' parameters.

    - If ``device_from_device_id`` is not ``None``, then this moves
    ``module`` to the device.
    - If ``device_from_device_id`` is ``None``, then this does not move
    ``module`` but warns the user if it is on CPU.

    Precondition: ``_check_single_device_module()``.
    r   Nc              3   @   K   | ]  }|j                   k(  r|  y wrY   r   )r]   r   r#  s     rD   r_   z)_move_module_to_device.<locals>.<genexpr>  s%      <<:-    FrJ  c              3   @   K   | ]  }|j                   k(  r|  y wrY   ri  )r]   r   r#  s     rD   r_   z)_move_module_to_device.<locals>.<genexpr>  s%      ==J. rj  )r   r   r`  ra  r  rb  extendr6  r	  rc  rZ   r,  r-  _move_states_to_devicenextr   _warn_cpu_init)r~   r   r  r  rd  r   r	  curr_moduler   r9  params_to_movebufs_to_mover   r#  s                @rD   r
  r
    s\   $ e$J( /:.?.?.AV%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113 ,	!)Y-O-OPLL+,! & &,Gq/F!GG#*Gaa.FGG~|=RS!&.94@EU\\Z7 8 HGs   1	E;E	EEr	  c                 6   t        |       dk(  rt        |      dk(  ryt        |       dkD  r| d   j                  }nt        |      dkD  r|d   j                  }t        j                  d      }|| D ]k  }t        j                         5  |j	                  |      |_        |j                  *|j                  j	                  |      |j                  _        ddd       m |D ]  }|j	                  |      |_         y|k(  rt                yy# 1 sw Y   xY w)z
    Move states to the specified device.

    Precondition: ``_check_single_device_module()`` and module's parameters and
    buffers have been materialized if needed.
    r   Nr   )rb   r   r   rV  todatagradro  )r   r	  r  rE  r#  r   r   s          rD   rm  rm    s    6{aCLA-
6{Q))	W	 **e$J(  	KE K"XX&;<
::)&+jjmm4I&JEJJOK K	K
  	;F ))$9:FK	;	:	% 
&K Ks   	ADD	c                  .    t        j                  d       y )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.)r   r    rF   rD   ro  ro    s    MM	1rF   c                     t        t        | |      d      }|&|j                  j                  dk7  r|j                  }n#t	        j                  |j                               }|||k7  rt        d| d| d|       |S )a)  
    Determine and return this FSDP instance's compute device.

    If the module is already on a non-CPU device, then the compute device is that non-CPU
    device. If the module is on CPU, then the compute device is the current
    device.

    Since this method should be called after materializing the module, any
    non-CPU device should not be meta device. For now, the compute device is
    always a CUDA or CUDA-like device with its explicit index.

    Precondition: ``_check_single_device_module()`` and
    ``_move_module_to_device()``.
    Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )rn  r   r   rP   r   rE  r5   )r~   r   r  r:   rB  r   r  s          rD   r  r    s    * !&.94@EU\\..%7m&B&B&DE(^?T-TB4&d#8"9;
 	
 rF   c                 ~   g }| j                         D ]  }t        |t        d      rt        |t        d       |j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }	}|j                  |	       |j                  |        |D ]l  }
|
j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }}|j                  |       \|j                  |       n t        |       t        ||t        d       yc c}w c c}w )z
    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
    been set.
    FTr   )srcN)r	  getattrFSDP_SYNCEDsetattrdetachr'   __tensor_flatten__rl  r  +_check_module_states_for_sync_module_statesr&   PARAM_BROADCAST_BUCKET_SIZE)r~   r   r-   module_statesr   detached_bufferattrsri   attrinner_buffersr   detached_paraminner_paramss                rD   r  r  4  s*    )+M.." 6v{E2FK.$mmoO,_= +==?qLQ RD$!? R R$$]3$$_56  1(8%88:HE1FKLdGND9LLL  .  01 0>#	 !S Ms   +D5D:r  c                 D    | rt        d | D              rt        d      y y )Nc              3   `   K   | ]&  }|j                   t        j                   d       k(   ( yw)r   N)r   r   )r]   tensors     rD   r_   z>_check_module_states_for_sync_module_states.<locals>.<genexpr>c  s'      17e,,s   ,.zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rN  r5   )r  s    rD   r  r  `  s7      ;H  C
 	
}rF   c              #      K   | j                         }	 	 t        |      }||vrt        |      s| # t        $ r Y yw xY ww)aD  
    Return an iterator over the original parameters in ``module``.

    The iterator does not return
    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
    present due to nested FSDP wrapping), or any original parameters already
    flattened (only relevant when ``use_orig_params=True``).
    N)r6  rn  r   StopIteration)r~   r   	param_genr   s       rD   r   r   m  sT      !!#IOEN*3Ee3L   s   A 4 	A AA  Ac           	          t        |       D ]A  \  }}||vst        |      rt        d| d|j                          d|j                          y)a5  
    Check that original parameters in ``fsdp_module`` have been flattened.

    The flattened parameters are made
    invisible to ``named_parameters()`` for the module hierarchy rooted at
    ``fsdp_module``. This should be called as a sanity check after flattening
    the wrapped module's parameters.
    z Found an unflattened parameter: z;  N)r   r   r   r;   	__class__)fsdp_moduler   r   r   s       rD   _check_orig_params_flattenedr    s^     ?{K 
E&/A%/H2:,b::<.%//!24 rF   c                 h    | t         j                  k(  rt        j                  S t        j                  S rY   )r!   r   r   allreduce_hookreduce_scatter_hook)r.   s    rD   _get_default_comm_hookr    s3      0 9 99 	$$ ..rF   c                 .    t        j                  |       S )NrI   )r   r>   rI   s    rD   rQ   rQ     s     %%MBBrF   rY   )r`  rW  r   r   collections.abcr   r   r   typingr   r   r   r	   r
   r   r   torch.distributeddistributedr[   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr)  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr,  torch.nnr   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   r   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr   torch.distributed.fsdp.apir   r   r   r   r    r!   r"   r#   torch.distributed.fsdp.wrapr$   &torch.distributed.tensor.parallel.fsdpr%   torch.distributed.utilsr&   torch.utils._python_dispatchr'   torch.utils.hooksr(   rO  
torchdistxr)   r*   ImportErrorintr  r}  ra   r\   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r6   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrE   r7   boolrO   rJ   rj   r|   rL   r   r   r   r   r   r&  r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r  r  r  r  rS  Tensorr
  rm  ro  r  r  r  r   r  r  r>   rQ   rx  rF   rD   <module>r     s
     	  9 9 O O    C C A A F F  B E A    B	 	 	 0 D < F 1 . ""34 #D$5$5t7H7H$HI E$"3"35P"PQR 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  )-00#0 (0 W	0
 *%0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   !++!! 
! !H++ 4d///0&  	++II+ huxx78+ %((,,-.%((//9R0SS	+ + +\I9=	: --II- %- c5<</01	-
 - -` II  " >> 01> n-> *%	>
 > > !>  > > >B    		'	 	 		 	 : J *   *   299 d2<<6H T & AA))A c5<</01A Hbii[$%678	A
 A A AH ))) ))) )<66x896 	^6x BF) !%((*<*<!=> 				D$$)$ 	X$:299 S 
II
%
 c5<</01
 
	
<c5<</01
 % ell	:7II7%7 ^7 4:	7<RYYK-. ^ 
	  #ELL1  ^  %	 F""-0^"	"))_",3II3%3 &3 $ELL1	3
 
3l%,, $ELL1 
	@II% $ELL1 	
 % \\F)II)) $$) 
	)X

%

	

II% bll,% 
(.> C$$CCO#  s   ^ ^^