
import warnings
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Optional, Union

import torch
import torch.distributed as dist
import torch.distributed.algorithms.model_averaging.utils as utils
from torch.utils._typing_utils import not_none as _not_none

__all__ = ["ModelAverager", "PeriodicModelAverager"]


class ModelAverager(ABC):
    r"""Base class for all model averagers.

    Args:
        process_group: The process group to be used for all-reduce.
                       If ``None``, the default process group, which
                       is created by :func:`torch.distributed.init_process_group`,
                       will be used. (default: ``None``)
    """

    def __init__(self, process_group: Optional[dist.ProcessGroup] = None):
        self.process_group = (
            process_group
            if process_group is not None
            else _not_none(dist.group.WORLD)
        )
        self.step = 0

    @abstractmethod
    def average_parameters(self, params):
        raise NotImplementedError


class PeriodicModelAverager(ModelAverager):
    r"""
    Averages parameters periodically after the warm-up stage.

    This can be used for running `post-local SGD <https://arxiv.org/abs/1808.07217>`_,
    by running :class:`~torch.nn.DistributedDataParallel` (DDP)
    using the subgroups created by :meth:`~torch.distributed.new_subgroups`.

    Args:
        period (int): The number of steps per model averaging.
                      Usually the period should be greater than ``1`` to reduce the communication cost.
                      Otherwise, only DDP needs to be used.
        warmup_steps (int): The number of warm-up steps. During this stage,
                            model averaging is skipped.
        process_group: The process group to be used for all-reduce.
                       If ``None``, the default process group, which
                       is created by :func:`torch.distributed.init_process_group`,
                       will be used. (default: ``None``)

    Example::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> import torch
        >>> import torch.distributed as dist
        >>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
        >>> import torch.distributed.algorithms.model_averaging.averagers as averagers
        >>> import torch.nn as nn
        >>>
        >>> dist.init_process_group("nccl", rank=rank, world_size=16)
        >>> torch.cuda.set_device(rank)
        >>> module = nn.Linear(1, 1, bias=False).cuda()
        >>> model = nn.parallel.DistributedDataParallel(
        >>>    module, device_ids=[rank], output_device=rank
        >>> )
        >>> # Register a post-localSGD communication hook.
        >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
        >>> model.register_comm_hook(state, post_localSGD_hook)
        >>>
        >>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
        >>> # After 100 steps, run model averaging every 4 steps.
        >>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
        >>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
        >>> for step in range(0, 200):
        >>>    optimizer.zero_grad()
        >>>    loss = loss_fn(output, labels)
        >>>    loss.backward()
        >>>    optimizer.step()
        >>>    # Will average model parameters globally every 4 steps. Thus,
        >>>    # inter-node communication only occurs every 4 iterations after
        >>>    # the initial ``warmup_steps`` period.
        >>>    averager.average_parameters(model.parameters())
    """

    def __init__(
        self,
        period,
        warmup_steps=0,
        process_group: Optional[dist.ProcessGroup] = None,
    ):
        super().__init__(process_group)
        if warmup_steps < 0:
            raise ValueError("Arg ``warmup_steps`` must be a non-negative number.")
        self.warmup_steps = warmup_steps
        if period < 1:
            raise ValueError("Arg ``period`` must be a positive value.")
        elif period == 1:
            warnings.warn(
                "When period is 1, no need to use model averaging because the communication cost "
                "of all-reducing parameters will be no less than the cost of all-reducing gradients "
                "by DistributedDataParallel in the backward pass. Therefore, only "
                "DistributedDataParallel should be used for this case."
            )
        self.period = period

    def average_parameters(
        self,
        params: Union[
            Iterable[torch.nn.Parameter], Iterable[dict[str, torch.nn.Parameter]]
        ],
    ):
        """
        Averages parameters or parameter groups of an optimizer.

        Averaging only runs once ``step`` is no less than ``warmup_steps``
        and ``step - warmup_steps`` is divisible by ``period``, where ``step``
        is increased by 1 at each iteration in the training loop.

        Args:
            params: The parameters of a model or parameter groups of an optimizer.
        """
        if (
            self.step >= self.warmup_steps
            and (self.step - self.warmup_steps) % self.period == 0
        ):
            utils.average_parameters_or_parameter_groups(
                params, _not_none(self.process_group)
            )
        self.step += 1