
import logging
import warnings
from collections.abc import Collection, Mapping
from copy import deepcopy
from typing import Any, Callable, Optional, Union, overload

import torch
import torch.nn as nn
from torch import optim
from torch.distributed._shard.sharded_tensor import ShardedTensor
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP


__all__: list[str] = []

logger = logging.getLogger(__name__)


class _NamedOptimizer(optim.Optimizer):
    """
    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by parameter key.

    We replace the original numeric keys of the wrapped optimizer with
    fully qualified name (FQN) strings. Users can initialize this optimizer
    the same way they initialize a PyTorch optimizer; the only difference is
    that they also need to pass in the FQN of each parameter.
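
    For example (shapes shown for illustration only, with a hypothetical
    ``linear`` submodule), where a plain optimizer ``state_dict`` indexes the
    state by parameter position::

        {"state": {0: {...}, 1: {...}}, "param_groups": [...]}

    a ``_NamedOptimizer`` keys the same entries by FQN::

        {"state": {"linear.weight": {...}, "linear.bias": {...}}, "param_groups": [...]}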

    Args:
        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
            Mapping from FQN to parameter.
        optimizer_class (optim.Optimizer):
            The class of optimizer to instantiate.
        param_groups (Collection[Mapping[str, Any]]):
            ``param_groups`` to pass to the optimizer if specified.
            Parameters listed under the ``params`` key of each group must be
            parameters that also appear in ``named_parameters``; they are
            mapped back to their FQNs internally.
            Default: None
        module (nn.Module): the module whose parameters are updated
            by the optimizer.
        args: arguments to pass to the optimizer constructor.
        kwargs: arguments to pass to the optimizer constructor.

    Example::
        >>> # xdoctest: +SKIP("distributed")
        >>> from torch import optim
        >>> from torch.distributed.optim import _NamedOptimizer
        >>>
        >>> # Define the named optimizer.
        >>> m = Model(...)
        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
        >>> # Forward pass + backward pass.
        >>> named_optim.step()
        >>> ...
        >>> # Calling state_dict on the named optimizer returns a FQN-keyed state_dict.
        >>> named_optim.state_dict()
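        >>>
        >>> # A hypothetical sketch (assuming ``m`` has a submodule ``fc``):
        >>> # parameter groups may also be passed explicitly; entries under
        >>> # "params" are the parameter tensors taken from named_parameters.
        >>> named_optim_grouped = _NamedOptimizer(
        ...     m.named_parameters(),
        ...     optim.SGD,
        ...     param_groups=[
        ...         {"params": [m.fc.weight], "lr": 0.1},
        ...         {"params": [m.fc.bias], "lr": 0.01},
        ...     ],
        ... )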

    Warning: This API is still in development and subject to change.

    TODO: Add tutorial for _NamedOptimizer.
    TODO: Add documentation in the docstring for the public attributes
          like self.param_groups and self.named_parameters.
    Nnamed_parametersoptimizer_classparam_groupsmodulereturnc                    t         j                  j                  d       || _        | j	                          t        |      | _        || j                  j                         n|} ||g|i || _        || _	        |)t        | j                  j                               | _        nt        j                  d       | j                  j                         D 	ci c]  \  }}	|	|
 }
}}	g }|D ]3  }|d   D ])  }	|	|
vrt!        d|	 d      |j#                  |
|	          + 5 || _        | j                  j                  | _        y c c}	}w )Nz'torch.distributed.optim._NamedOptimizerzvSince we pass in param_groups, we will use param_groups to initialize the optimizer, not all parameters of the module.paramszExpect param name z% found in param group but is missing.)torch_C_log_api_usage_oncer   _param_groups_checkdictr   values
_optimizerr   listkeysordered_param_keyswarningswarnitems
ValueErrorappend)selfr   r   r   r   argskwargsparams_for_optimizerkeyparamparam_to_keyr!   groups                W/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/optim/named_optimizer.py__init__z_NamedOptimizer.__init__@   sh    	$$%NO;G  " $%5 6.:.BD!!((* 	 * 

 

 &*4+@+@+E+E+G&HD#MMN :>9N9N9T9T9VW:3E3JWLW!#% C"8_ CEL0(07\]  '--l5.ABCC '9D# OO88 Xs   Ec                 l   | j                   | j                   D ]  }t        |t              sJ d       d|v sJ d       |d   }t        |t        j                        r|g}t        |      }|D ]=  }t        |t        j                        rt        dt        j                  |      z          ||d<    y y )Nparam group must be a dictr   z#param group must contain key paramsz>optimizer can only optimize Tensors, but one of the params is )r   
isinstancer   r   Tensorr   	TypeErrortypename)r'   param_groupr   r,   s       r/   r   z#_NamedOptimizer._param_groups_checkj   s    (#00 /!+t4R6RR4;.U0UU.$X.fell3$XFf# E%eU\\:'8:?..:OP  )/H%/ )    c                    | j                   j                         }|d   }|d   j                         D ci c]  \  }}| j                  |   | }}}g }|D ]k  }|d   D cg c]  }| j                  |    }	}dt	        |	      i}
|j                         D ]  \  }}|dk7  st        |      |
|<    |j                  |
       m | j                  ||d      S c c}}w c c}w )z
        Return the ``state_dict`` of the optimizer.

        Instead of using numeric indices for parameters, we use the module's
        fully qualified names (FQNs) as keys.
        """
        state_dict = self._optimizer.state_dict()
        param_groups = state_dict["param_groups"]

        ret_state = {
            self.ordered_param_keys[st_key]: state_val
            for st_key, state_val in state_dict["state"].items()
        }

        ret_groups = []
        for group in param_groups:
            param_keys = [self.ordered_param_keys[param] for param in group["params"]]
            ret_group = {"params": sorted(param_keys)}
            for k, v in group.items():
                if k != "params":
                    ret_group[k] = deepcopy(v)
            ret_groups.append(ret_group)

        return self._post_state_dict({"state": ret_state, "param_groups": ret_groups})

    @overload
    def step(self, closure: None = ...) -> None: ...

    @overload
    def step(self, closure: Callable[[], float]) -> float: ...

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """
        Perform a single optimization step.

        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
        optimizer.
        """
        return self._optimizer.step(closure=closure)

    @property
    def state(self) -> Mapping[torch.Tensor, Any]:
        return self._optimizer.state

    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
        """
        Define the default behavior to load a state_dict for ``_NamedOptimizer``.

        Sample Code
        ```
            my_model = MyModule()
            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
            ...

            optim_state_dict = optimizer.state_dict()
            ...
            ...

            optimizer.load_state_dict(optim_state_dict)
            ...
        ```
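
        If the wrapped optimizer has not materialized any state yet (PyTorch
        optimizers create state lazily), one possible flow, sketched here as an
        illustration, is to call :meth:`init_state` before loading:
        ```
            optimizer.init_state()
            optimizer.load_state_dict(optim_state_dict)
        ```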
        Args:
            state_dict (Dict[str, Any]): A ``state_dict`` to load into the optimizer.
                Note that this state dict update is performed in place.

        .. note:: PyTorch initializes optimizer states lazily, so it is
            possible that no optimizer state exists yet when a user calls
            ``load_state_dict``. For ``_NamedOptimizer`` we are stricter and
            require that ``load_state_dict`` only be called after the state
            has been initialized.
            By doing this, we can validate the optim ``state_dict`` to be loaded.
        """
        new_state_dict = self._optimizer.state_dict()
        state_dict = self._pre_load_state_dict(state_dict)
        state = state_dict["state"]
        new_state = new_state_dict["state"]
        if len(new_state) == 0:
            raise ValueError(
                "Expects the optim to be initialized before load but found not initialized."
            )

        for idx, param_key in enumerate(self.ordered_param_keys):
            # Not every parameter is guaranteed to have state; skip missing keys.
            if param_key not in state.keys():
                continue
            if len(state[param_key]) != len(new_state[idx]):
                raise ValueError(
                    f"Expects equal length as {len(new_state[idx])} "
                    f"for parameter {param_key} but found: {len(state[param_key])}"
                )
            # Iterate through all optimizer states for this parameter.
            for state_key, state_val in new_state[idx].items():
                if state_key not in state[param_key]:
                    raise ValueError(
                        f"Expects state {state_key} for parameter {param_key} but not found."
                    )

                src_state_val = state[param_key][state_key]
                if isinstance(state_val, ShardedTensor):
                    assert isinstance(src_state_val, ShardedTensor)
                    num_shards = len(state_val.local_shards())
                    num_new_shards = len(src_state_val.local_shards())
                    if num_shards != num_new_shards:
                        raise ValueError(
                            f"Expects equal number of shards as {num_new_shards} "
                            f"but found {num_shards} for {param_key}/{state_key}"
                        )
                    for shard, src_shard in zip(
                        state_val.local_shards(), src_state_val.local_shards()
                    ):
                        shard.tensor.detach().copy_(src_shard.tensor)
                elif isinstance(state_val, torch.Tensor):
                    assert isinstance(src_state_val, torch.Tensor)
                    state_val.detach().copy_(src_state_val)
                else:
                    new_state[idx][state_key] = deepcopy(src_state_val)

        # Load the param_groups of the state_dict.
        src_param_groups = state_dict["param_groups"]
        new_param_groups = new_state_dict["param_groups"]

        src_group_map = {}
        for group in src_param_groups:
            param_keys = list(group["params"])
            src_group_map[_gen_param_group_key(param_keys)] = group
        new_group_map = {}
        for new_group in new_param_groups:
            param_keys = []
            for param_key in new_group["params"]:
                param_keys.append(self.ordered_param_keys[param_key])
            new_group_map[_gen_param_group_key(param_keys)] = new_group
        for group_key, new_group in new_group_map.items():
            # Groups whose parameters did not receive gradients may be absent
            # from the source state_dict; skip them.
            if group_key not in src_group_map:
                continue
            src_group = src_group_map[group_key]
            if len(src_group) != len(new_group):
                raise ValueError(
                    f"Expects equal param_group size as {len(new_group)} "
                    f"for group {group_key} but found {len(src_group)}."
                )
            for k in src_group:
                if k not in new_group:
                    raise ValueError(
                        f"Expects group key {k} to be in group {group_key} "
                        f"in `state_dict` but is missing."
                    )
                if k != "params":
                    new_group[k] = deepcopy(src_group[k])

        self._optimizer.load_state_dict(new_state_dict)

    def add_param_group(self, param_group: Mapping[str, Any]) -> None:
        """
        Add a param group to the :class:`_NamedOptimizer`'s `param_groups`.

        Warning: This API is still in development and subject to change.
        """
        assert isinstance(param_group, dict), "param group must be a dict"

        params = param_group["params"]
        if isinstance(params, torch.Tensor):
            param_group["params"] = [params]
        else:
            param_group["params"] = list(params)

        param_to_key = {param: key for key, param in self.named_parameters.items()}
        for param in param_group["params"]:
            if param not in param_to_key:
                raise ValueError("some parameters are not in the module")
            self.ordered_param_keys.append(param_to_key[param])

        self._optimizer.add_param_group(param_group)
        # Mirror the param_groups of the wrapped optimizer.
        self.param_groups = self._optimizer.param_groups

    def init_state(self) -> None:
        """
        Run a dummy optimizer step to initialize the optimizer state, since most optimizers initialize their state lazily.

        This allows doing in-place loading of optimizer state from a checkpoint.
        """
        for param in self.named_parameters.values():
            if param.requires_grad:
                t = torch.zeros_like(param)
                param.grad = torch.autograd.Variable(t)
        # Calling ``step`` materializes the optimizer state for all parameters.
        self.step(closure=None)

    def _pre_load_state_dict(self, state_dict) -> dict[str, Any]:
        if isinstance(self.module, FSDP):
            return FSDP.optim_state_dict_to_load(
                self.module, self._optimizer, state_dict, is_named_optimizer=True
            )
        return state_dict

    def _post_state_dict(self, state_dict) -> dict[str, Any]:
        if isinstance(self.module, FSDP):
            FSDP.optim_state_dict(self.module, self._optimizer, state_dict)
        return state_dict


def _gen_param_group_key(param_keys: list[str]) -> str:
    """Concatenate all param keys as a unique identifier for one param group."""
    return "/".join(sorted(param_keys))