
import logging
from collections import abc, defaultdict
from collections.abc import Iterable
from typing import Any, Optional, overload, Union

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup


logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        "mtia",
        "xpu",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert _is_supported_device(master_tensor)
        self.master = master_tensor
        # Per-device copies are created lazily by the inherited get() method.
        self._per_device_tensors: dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
    * Syncs inf/nan for scaled gradient tensors on any torch.device (where the tensors
      are placed) across nodes

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()
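
        # A sharded variant of the same loop (illustrative sketch only): it assumes
        # torch.distributed is already initialized, and MyModel/loss_fn/data are
        # placeholder names that are not defined in this module.
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

        model = FSDP(MyModel().cuda())
        optimizer = torch.optim.AdamW(model.parameters())
        scaler = ShardedGradScaler()

        for input, target in data:
            optimizer.zero_grad()
            with torch.autocast("cuda", dtype=torch.float16):
                loss = loss_fn(model(input), target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()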

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
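
    For example, a scaler that starts from a smaller initial scale and re-evaluates
    growth less often could be constructed as follows (the values are illustrative,
    not recommendations)::

        scaler = ShardedGradScaler(init_scale=2.**10, growth_interval=1000)
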
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...

    @overload
    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...

    @overload
    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            assert _is_supported_device(outputs)
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            scaled_output = outputs * self._scale.to(outputs.device, non_blocking=True)
            # Keep the return dtype equal to the outputs dtype: with FSDP mixed
            # precision the loss may be fp16/bf16, and the scaled loss should match it.
            return scaled_output.to(outputs.dtype)

        # Holds a lazily built replicator of self._scale so the scale tensor is copied
        # to each device at most once while walking the iterable.
        stash: list[_GeneralMultiDeviceReplicator] = []

        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
            if isinstance(val, torch.Tensor):
                assert _is_supported_device(val)
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                # Keep the return dtype equal to the input dtype (see above).
                return scaled_val.to(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device
        # and dtype so the parameter list is walked only once.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # coalesce() deduplicates indices and sums their values, which
                        # can overflow in fp16, so coalesce in fp32 and cast back.
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

        # Some ranks may hold no (non-zero sized) parameter shards, so make sure a
        # found_inf tensor exists for at least the scale's device.
        if not per_device_found_inf._per_device_tensors:
            assert self._scale is not None
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so carry out the
        # reciprocal in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )

        for work in works:
            work.wait()

        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        assert self._scale is not None and self._growth_tracker is not None

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
        """
        Updates the scale factor.

        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.

        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)

        Args:
            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.

        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = (
                    "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
                    "torch.FloatTensor with requires_grad=False."
                )
                assert new_scale.device.type == self._device, reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)
        else:
            # Consume shared inf/nan data collected from optimizers to update the scale.
            # If all found_inf tensors are on the same device as self._scale, this
            # operation is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            if _scale.device.type == "cpu":
                self._amp_update_scale_cpu_(found_inf_combined)
            else:
                torch._amp_update_scale_(
                    self._scale,
                    self._growth_tracker,
                    found_inf_combined,
                    self._growth_factor,
                    self._backoff_factor,
                    self._growth_interval,
                )

        # To prepare for the next iteration, clear the data collected from optimizers
        # this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)