
    Vh              	          d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl m	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d dl mc m!Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z(m)Z)m*Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlmFZFmGZG d dlHmIZJ d dlKmLZLmMZMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZU d dlVmWZW dZXeSrdZYdZZej                  j                         ZXn.eTrdZYdZZn'eUrdZYdZZej                  j                         ZXnd ZYd!ZZd"ZX G d# d$e      Z^ G d% d&e      Z_ G d' d(ej                  e      Zad)ej                  d*ej                  d+efd,Zcd- Zd	 	 dtd)ej                  d.eefd/Zfdud0Zgd1 Zhd2 Zidvd)ej                  d3eefd4Zjd)ej                  d5eefd6Zkd)ej                  d7eefd8Zl G d9 d:      Zm G d; d<ea      Zn G d= d>ea      Zo G d? d@eo      Zp G dA dBeo      Zq G dC dDea      Zr G dE dFer      Zs G dG dHej                        Zt G dI dJeo      Zu G dK dLej                        Zv G dM dNej                        Zx G dO dPej                        Zye j                  dQefdR       Z{e j                  dSefdT       Z|e j                  dUefdV       Z}ee j                  dWefdX              Z~ee j                  dYefdZ              Zee j                  d[efd\              Zee j                  d]efd^              Zd_ed+ed`edaefdbZ	 dwdcej                  ddej                  deeedff   fdgZ G dh dieM      Z G dj dkeL      Zdxdlee   fdmZ G dn doej                        Z G dp dqej                        Z G dr dsej                        Zy)y    N)ABCabstractmethod)nullcontext)deepcopy)autoEnumwraps)AnyCallablecastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_ms	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudancclzhpu:0hcclxpuxcclcpugloo   c                   (    e Zd Z e       Z e       Zy)FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     S/home/dcms/DCMS/lib/python3.12/site-packages/torch/testing/_internal/common_fsdp.pyr@   r@   V   s    fGIrG   r@   c                   6    e Zd Z e       Z e       Z e       Zy)DEVICEInitModeN)rA   rB   rC   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrF   rG   rH   rJ   rJ   _   s    FM6L6LrG   rJ   c                       e Zd ZdZedeej                  df   fd       Zedej                  fd       Z	edd       Z
eeded	edej                  fd
              Zy)FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                      y)z+Returns an input for the model as as tuple.NrF   selfdevices     rH   	get_inputzFSDPTestModel.get_inputl        	rG   c                      y)z,Returns the loss given the input and output.NrF   )rS   inputoutputs      rH   get_losszFSDPTestModel.get_lossq   rV   rG   Nc                      y)z<Runs the backward pass (e.g. including ``loss.backward()``).NrF   rS   losss     rH   run_backwardzFSDPTestModel.run_backwardv   rV   rG   argskwargsc                       y)z&Initializes an instance of this model.NrF   )r_   r`   s     rH   initzFSDPTestModel.init{   s     	rG   rP   N)rA   rB   rC   __doc__r   tupletorchTensorrU   rZ   r^   staticmethodr   nnModulerb   rF   rG   rH   rO   rO   h   s     5s):#;        C 3 299   rG   rO   modelprocess_group	assert_fnc                 ,   | j                         D cg c]%  \  }}||j                         j                         f' }}}|| j                         D cg c]%  \  }}||j                         j                         f' c}}z  }t	        j
                  |      }t        |      D 	cg c]  }	d }
}	t	        j                  |
||       |
d   }|J |
dd D ])  }|J t        ||      D ]  \  \  }	}\  }	} |||        + yc c}}w c c}}w c c}	w )a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    Ngroupr   r>   )	named_parametersdetachr<   named_buffersdistget_world_sizerangeall_gather_objectzip)rk   rl   rm   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rH   _assert_module_statesr      s6    "'!7!7!9J 
U\\^'')*  #(#6#6#8K 
fmmo))+,  $$]3J ,-aT-E-5"5]K8L###qr     #L% 8 	GQWab"	
 .s   *D*D'	Dc                  4    t        j                  t              S N)rf   rT   DEVICE_TYPErF   rG   rH   get_devtyper      s    <<$$rG   zero_buffersc                    |rt        j                  |       n	t               }|5  | j                         D ]/  }t	        j
                         5  |j                          ddd       1 |rB| j                         D ]/  }t	        j
                         5  |j                          ddd       1 ddd       y# 1 sw Y   xY w# 1 sw Y   PxY w# 1 sw Y   yxY w)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrf   no_gradzero_buffers)rk   r   summon_fullctxrz   r}   s         rH   _zero_modelr      s     -8$
!
!%
([]C	 #%%' 	E  	 --/ #]]_ #LLN# ### # # ## #s;   (CB43CC !
C4B=9C C	CCc                 t    |s| j                  t              } |r| j                          | j                         S r   )tor   half
state_dict)rk   cpu_offloadr   s      rH   _get_state_dictr      s.    %

rG   c           	      j    dj                  |D cg c]  }|| t        |         nd c}      S c c}w )Nr   none)joinstr)test_name_mappingr_   ss      rH   subtest_namer      s7    88IMNAam	3q6	"	?N Ns   0c                    |j                         D ];  \  }}|j                  t        j                  d      k7  s)|j                         ||<   = | dk(  r|nd g}t	        j
                  |       t        t        t        t        j                  f   |d         }|j                         D ]  }||   j                  t              ||<    |S )Nr<   r   )itemsrT   rf   r<   rt   broadcast_object_listr   dictr   rg   keysr   r   )rankr   ry   rz   r   s        rH   _broadcast_state_dictr      s     (--/ 1
E<<5<<..%*YY[Jz"1  19Z$/Eu%d3,-uQx8J oo' H
!+J!7!:!:;!G
:HrG   recursec                     t        j                  | |      5  t        t        | j	                                     cddd       S # 1 sw Y   yxY w)a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r   r   r   listr   )rk   r   s     rH   get_full_paramsr      s?     
	 	 	8 2U--/012 2 2s   "AAmove_to_devicec                 4    |r| j                  t              S | S r   )r   r   )rk   r   s     rH   _move_to_devicer      s    $2588K ==rG   	wrap_fsdpc                 (    |s| S t        | g|i |S r   r   )rk   r   r_   r`   s       rH   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrG   c                   :    e Zd ZdedefdZdefdZdefdZd Zy)	DummyProcessGroupr   sizec                      || _         || _        y r   )_rank_size)rS   r   r   s      rH   __init__zDummyProcessGroup.__init__   s    

rG   rP   c                     | j                   S r   )r   rS   s    rH   r   zDummyProcessGroup.rank       zzrG   c                     | j                   S r   )r   r   s    rH   r   zDummyProcessGroup.size   r   rG   c                 B    t        j                         }d }||_        |S )Nc                  d    t         j                  j                         } | j                  d       | S )Nr>   )rf   futuresFuture
set_result)futures    rH   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future   s'    +0==+?+?+AFa MrG   )r   Mockr   )rS   r_   r`   	dist_waitr   s        rH   	allreducezDummyProcessGroup.allreduce   s"    IIK		
  *	rG   N)rA   rB   rC   intr   r   r   r   rF   rG   rH   r   r      s2    S  c c 	rG   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej                  d
ededeeeef      dededeej(                  ef   fd       Zd Z xZS )TransformerWithSharedParamsrp   device_init_modeadd_bndeterministicc                    t         |           |j                         | _        |j                         | _        |rt        j                  d       d}d}t        j                  ||      | _	        t        j                  |dddd      | _        t        j                  ||      | _        | j                  j                  | j                  _        | j                  d| j                  j                  j!                  |f             | j                  d	t        j"                  | j$                  t
        j&                  
             d| _        |r)t
        j                  j+                  | j(                        nt
        j                  j-                         | _        |t0        j2                  k(  r| j5                  t6              } |r| j9                          y y )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r~   rf   manual_seedri   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrJ   rK   r   r   eval)rS   rp   r   r   r   d_vocabr   	__class__s          rH   r   z$TransformerWithSharedParams.__init__  sa    	JJL	**,a LL':>>  
 99Wg6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rG   c                 ,   t        j                  d| j                  z          t        j                  d|      j	                  d| j
                        }t        j                  | j
                  dz  |      j	                  d| j
                        }||fS )Nr>      rT      r6   )rf   r   r   arangeviewr   )rS   rT   srctgts       rH   rU   z%TransformerWithSharedParams.get_input.  sl    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGSzrG   c                    | j                  |      }|| j                  z   | j                  j                  |      z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      S r   )r   r   r   type_asr   r   r   )rS   src_idstgt_idsr   r   xs         rH   forwardz#TransformerWithSharedParams.forward4  sv    (DOO#d&6&6&>&>s&CC(ggclS#&""rG   c                     |\  }}t         j                  j                  |j                  d|j	                  d            |j                  d      d      S )Nsum)	reduction)ri   
functionalcross_entropyr   r   )rS   rX   rY   r   r   s        rH   rZ   z$TransformerWithSharedParams.get_loss<  sI    3}}**KKFKKO,chhrle + 
 	
rG   c                 $    |j                          y r   backwardr\   s     rH   r^   z(TransformerWithSharedParams.run_backwardB      rG   fsdp_init_modefsdp_kwargsrP   c                 N   |i }|t         j                  k(  r&t        | t              r| d   }n| }t	        ||||      S |t         j
                  k(  rd|vrt        t        t        h      }n|j                  d      }d|v r8|d   t        j                  t        j                  hv rt        | t              sd}n| }t        | t              r| d   }	n| }	t	        |	|||      }
t        |
|fd|i|}|t        j                  k(  r|j!                  t"              }|S t%        d|       )au  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r@   rD   
isinstancere   r   rE   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rJ   rL   r   r   
ValueError)rp   r  r   r	  r   r   pgr  fsdp_pg
tformer_pgm
fsdp_models               rH   rb   z TransformerWithSharedParams.initE  sX   6 K\111%'1X.$fm  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%0%'"1X
"
+,fmA  "2 	J  >#>#>>']];7
77GHIIrG   c                     | j                   gS r   )r   r   s    rH   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modules  s      !!rG   )NFT)rA   rB   rC   rt   ProcessGrouprJ   boolr   rU   r   rZ   r^   rh   r@   r   r   r   r   r   ri   rj   r   rb   r  __classcell__r   s   @rH   r   r     s    (  ( )( 	(
 (T#
 
 15#KJ  KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ"rG   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej                  d
ededeeeef      dedej&                  fd       Z xZS )NestedWrappedModulerp   r   r   r   c                    t         |           j                         | _        j                         | _        |t
        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        t        j                  dd      |            t        t        j                  dd      |            | _        y )Nc                 &    rt        | fi S | S r   r   layerr	  rp   r   s    rH   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrG   r   r   r6   r   )r   r   r   r   r~   rJ   rK   rf   r   ri   
Sequentialr   r   module	rS   rp   r   r   r   r	  r   r$  r   s	    ``  `  rH   r   zNestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rG   c                 x    t        j                  d| j                  z          t        j                  dd|      fS )Nr>   r6   r   r   )rf   r   r   randrR   s     rH   rU   zNestedWrappedModule.get_input  s.    !dii-(

1a/11rG   c                 $    | j                  |      S r   r'  rS   r   s     rH   r   zNestedWrappedModule.forward      {{1~rG   c                 &    |j                         }|S r   )r   rS   rX   rY   r]   s       rH   rZ   zNestedWrappedModule.get_loss  s    zz|rG   c                 $    |j                          y r   r  r\   s     rH   r^   z NestedWrappedModule.run_backward  r  rG   r  r	  rP   c                    |i }|t         j                  k(  rt        | d||      S |t         j                  k(  r:t        | fd||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        Fr   r   r   Tr  )	r@   rD   r  rE   rJ   rL   r   r   r  )rp   r  r   r	  r   r  s         rH   rb   zNestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrG   NF)rA   rB   rC   rt   r  r  rJ   r   rU   r   rZ   r^   rh   r@   r   r   r   r   ri   rj   rb   r  r  s   @rH   r  r    s    
  
 
 )	

 
@2 
 15#+J  +J$+J )+J d38n-	+J
 +J 
+J +JrG   r  c                   h     e Zd Ze	 	 ddej
                  dededee	e
ef      def
 fd       Z xZS )AlwaysWrapNestedWrappedModulerp   r  r   r	  r   c                 0   t         t        t          	 | t        j                  |||      }|t        j                  k(  r|S |t        j
                  k(  rB|xs i }t        |fdt        i|}|t        j                  k(  r|j                  t              }|S y)z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )rp   r  r   r	  r   r  N)r   r6  rb   r@   rD   rE   r   r   rJ   rL   r   r   )rp   r  r   r	  r   rk   r  r   s          rH   rb   z"AlwaysWrapNestedWrappedModule.init  s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rG   r4  )rA   rB   rC   rh   rt   r  r@   rJ   r   r   r   r   r  rb   r  r  s   @rH   r6  r6    s^    
 15#  $ ) d38n-	
  rG   r6  c                        e Zd Zdej                  dededef fdZed
d       Z	e	 	 ddej                  de
dedeeeef      def
d	       Z xZS )NonUniformReqGradNWMrp   r   r   r   c                    t         t        |           j                         | _        j	                         | _        |t        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        j                  t        t        j                  dd      |      t        t        j                  dd      |                        | _        y )Nc                 &    rt        | fi S | S r   r   r"  s    rH   r$  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap+  r%  rG   r   r   r6   r   )r   r  r   r   r   r~   rJ   rK   rf   r   ri   r&  r   r   r'  r(  s	    ``  `  rH   r   zNonUniformReqGradNWM.__init__  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rG   c                     | j                         D ]-  \  }}t        j                  ||      r|j                  d       / y r4  )rq   rematchrequires_grad_)rk   req_grad_masknps       rH   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_gradB  s:    **, 	(DAq88M1-  '	(rG   r  r	  c                    t        j                  d      }|t        j                  k(  r't	        | d||      }t        j                  ||       |S |t        j                  k(  rT|i }t	        | fd||d|}|t        j                  k(  r|j                  t              }t        j                  ||       |S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr3  Tr  )r=  compiler@   rD   r9  rC  rE   rJ   rL   r   r   r  )rp   r  r   r	  r   req_grad_pattern	ddp_modelr  s           rH   rb   zNonUniformReqGradNWM.initH  s    ( ::&9:\111,!1+	I !99)EUV|555" -!1+	
 J  >#>#>>']];7
 99*FVW77GHIIrG   rc   r4  )rA   rB   rC   rt   r  r  rJ   r   rh   rC  r@   r   r   r   r   rb   r  r  s   @rH   r9  r9    s    (
  (
 (
 )	(

 (
T ( (
 
 15#+J  +J$+J )+J d38n-	+J
 +J +JrG   r9  c                        e Zd ZdZdej
                  dedef fdZd Zd Z	d Z
d	 Zed
ee   dedededef
d       Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r'  delay_after_loss_msdelay_before_reduction_msc                 L    t         |           || _        || _        || _        y r   )r   r   rJ  rK  r'  )rS   r'  rJ  rK  r   s       rH   r   zModuleWithDelay.__init__{  s'     	#6 )B&rG   c                 8    | j                   j                  |      S r   )r'  rU   rR   s     rH   rU   zModuleWithDelay.get_input  s    {{$$V,,rG   c                 $    | j                  |      S r   r,  r-  s     rH   r   zModuleWithDelay.forward  r.  rG   c                 B   | j                   j                  ||      }| j                  dkD  rst        st        r$t        j                  | j                  dz         |S t        r=t        j                  j                  t        | j                  t               z               |S Nr     )r'  rZ   rJ  r3   r4   timesleepr2   rf   r7   _sleepr   r1   r0  s       rH   rZ   zModuleWithDelay.get_loss  sy    {{##E62##a'8

433d:;  

!!#d&>&>ARAT&T"UVrG   c                      t         j                  j                   fd}t        j                  d|      5   j
                  j                  |       d d d        y # 1 sw Y   y xY w)Nc                     j                   dkD  rrt        r>t        j                  j	                  t        j                   t               z               n.t        st        r"t        j                  j                   dz          | i |S rP  )rK  r2   rf   r7   rT  r   r1   r3   r4   rR  rS  )r_   r`   orig_reduce_scatterrS   s     rH   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sk    --1JJ%%D::=N=PPQ JJt==DE&777rG   z'torch.distributed.reduce_scatter_tensor)rf   distributedreduce_scatter_tensorr   patchr'  r^   )rS   r]   rX  rW  s   `  @rH   r^   zModuleWithDelay.run_backward  sW    #//EE	8 ZZ57N
 	+ KK$$T*	+ 	+ 	+s   AA'module_class
model_argsmodel_kwargsc                <    t         | j                  |i |||      S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )rI  rb   )r\  rJ  rK  r]  r^  s        rH   rb   zModuleWithDelay.init  s,    * Lz:\:%
 	
rG   )rA   rB   rC   rd   ri   rj   r   r   rU   r   rZ   r^   rh   typerO   r   rb   r  r  s   @rH   rI  rI  w  s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rG   rI  c                   ~    e Zd Zeej
                  ddddfdej                  dedede	e
eef      ded	ed
efd       Zy)NestedWrappedModuleWithDelayNFr   rp   r  r   r	  r   rJ  rK  c           
      D    t         j                  t        | ||||||      S )Nrp   r  r   r	  r   rJ  rK  )rI  rb   r  rd  s          rH   rb   z!NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rG   )rA   rB   rC   rh   rJ   rL   rt   r  r@   r   r   r   r   r  r   rb   rF   rG   rH   rb  rb    s     ,:+F+F04##$)*
  
$
 )
 d38n-	

 
 !
 $'
 
rG   rb  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 0    t         |           || _        y r   )r   r   r'  )rS   r'  r   s     rH   r   zDummyDDP.__init__  s    rG   c                 &     | j                   |i |S r   r,  rS   r_   r`   s      rH   r   zDummyDDP.forward  s    t{{D+F++rG   rA   rB   rC   r   r   r  r  s   @rH   rf  rf    s    ,rG   rf  c                        e Zd Zdej                  dedededef
 fdZd Z	d Z
e	 	 	 ddej                  d	eded
eeeef      dedefd       Z xZS )MixtureOfExpertsrp   r   r   delay_before_free_msr   c                    t         |   ||||       || _        || _        || _        |t
        j                  k(  | _        |r"t        j                  d| j                  z          d}d}d}	t        t        j                  ||      | j                        }
t        d |
j                         D              | _        |
j                         D ]	  }d|_         |rt        j                  d       t        t        j                  ||      | j                        }|rHt        j$                  j'                  |j                         g      }t)        |
|fi |}
t)        ||fi |}t        j*                  t        t        j                  |	|      | j                        ||
t        t        j                  ||	      | j                              | _        y )	N)rp   r   r   r   *   r   r   r   c              3   <   K   | ]  }|j                           y wr   )numel).0rB  s     rH   	<genexpr>z,MixtureOfExperts.__init__.<locals>.<genexpr>   s     $L1QWWY$L   Tr   )r   r   rp   rm  r   rJ   rK   r   rf   r   r   r   ri   r   r   r   num_expert_paramsexpertrY  	new_groupr   r&  r'  )rS   rp   r   r   rm  r   r	  d_expertd_sharedd_inputrv  rB  sharedexpert_groupr   s                 rH   r   zMixtureOfExperts.__init__  s    	-'	 	 	
 
$8!"..2N2NNb499n- 8X!>@S@ST!$$L8I8I8K$L!L""$ 	AAH	 a  8X!>@S@ST ,,66L &,>+>F&%7;7FmmBIIgx8$:M:MNBIIh8$:M:MN	
rG   c                 f     j                   dkD  r j                  d   }t        |t              ret        j
                  j                  j                  j                   fd}t        j                  d|      5   j                  |      cd d d        S  j                  |      S # 1 sw Y   xY w)Nr   r   c                      t         r>t        j                  j                  t	        j
                  t               z               n.t        st        r"t        j                  j
                  dz          | i |S )NrQ  )r2   rf   r7   rT  r   rm  r1   r3   r4   rR  rS  )r_   r`   orig_reshardrS   s     rH   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard  s]     

)) 9 9<M<O OP "X

4#<#<t#CD'888rG   z.torch.distributed.fsdp._runtime_utils._reshard)rm  r'  r  r   rf   rY  fsdp_runtime_utils_reshardr   r[  )rS   r   rv  r  r  s   `   @rH   r   zMixtureOfExperts.forward  s    $$q([[^F&$'$0055DDMM9 ZZDFV *  ;;q>* *
 {{1~* *s   ;B''B0c                    |j                          | j                  st        j                         5  | j	                         D ]v  }t        |d      r|j                  |j                  j                  | j                         t        j                  j                  |j                  | j                         x 	 d d d        y y # 1 sw Y   y xY w)Nrv  ro   )r  r   rf   r   r   hasattrgraddiv_r~   rY  
all_reducerp   )rS   r]   rB  s      rH   r^   zMixtureOfExperts.run_backward1  s    ~~ O* OAq(+ vv)DOO4))44QVV4::4NOO O O Os   -CACCr  r	  c                    |i }|t         j                  k(  rt        | d|||      S |t         j                  k(  r;t        | fd|||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        F)r   r   rm  r   Tr  )	r@   rD   rl  rE   rJ   rL   r   r   r  )rp   r  r   r	  r   rm  r  s          rH   rb   zMixtureOfExperts.init=  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrG   )NFr   )rA   rB   rC   rt   r  r  rJ   r   r   r   r^   rh   r@   r   r   r   r   rb   r  r  s   @rH   rl  rl    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 15#$%0J  0J$0J )0J d38n-	0J
 0J "0J 0JrG   rl  c                        e Zd Z	 ddddddedeej                     deded	ef
 fd
Zdej                  dej                  fdZ
d Z xZS )MLPTFr6   )biaswith_bufferdim_multiplierdimrT   r  r  r  c                
   t         |           t        j                  |||z  ||      | _        t        j                  ||z  |||      | _        |r)| j                  dt        j                  |f|             y d | _	        y )N)rT   r  r}   r   )
r   r   ri   r   in_projout_projr   rf   randnr}   )rS   r  rT   r  r  r  r   s         rH   r   zMLP.__init__r  so     	yyns&:6PTU		.3"6FQUV  5;;vf+MNDKrG   r   rP   c                     | j                  |      }t        j                  |      }| j                  |      }t        j                  |      }| j                  || j                  z   }|S r   )r  Frelur  r}   )rS   r   zs      rH   r   zMLP.forward  sS    LLOFF1IMM!FF1I;;"DKKArG   c                     | j                   4t        j                  j                  j	                  | j                          y y r   )r}   rf   ri   rb   normal_r   s    rH   reset_parameterszMLP.reset_parameters  s+    ;;"HHMM!!$++. #rG   r   )rA   rB   rC   r   r   rf   rT   r  r   rg   r   r  r  r  s   @rH   r  r  q  sv     *.
 ! &
   " %,, /rG   r  c                   F     e Zd Zdddedef fdZdededed	d fd
Z xZS )MLPStackF)with_seq_parallelmlp_dimr  c                    t        |d      t        |      t        |d      g}|r&|j                  t        j                  |d             t	        |   |  || _        y )N   )r  Fr  )r  appendri   	LayerNormr   r   r  )rS   r  r  modulesr   s       rH   r   zMLPStack.__init__  sX     *L*	$
 NN2<<e<='"!2rG   tp_meshdp_meshuse_activation_checkpointingrP   c           
         t        d      t        d      t        d      t        d      t        d      | j                  rt        t        d            n	t               d}| j                  rt	        d      |d<   t        | ||       | D ]8  }t        |t        j                        r|rt        |       t        |fd	|i| : t        | fd	|i| | S )
NF)use_local_outputr>   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r  ri   r  r   r   )rS   r  r  r  r	  r  r'  s          rH   parallelizezMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4WGWX 	=F&",,/+6"<W<<	= 	D6w6+6rG   )	rA   rB   rC   r   r  r   r   r  r  r  s   @rH   r  r    sD    BG 
3 
34 
3  '+	 
rG   r  c                        e Zd ZdZddedef fdZdej                  de	e
ej                  ej                  f   ej                  f   fdZ xZS )	DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    r  use_second_linearc                     t         |           t        j                  ||      | _        t        j                  ||      | _        t        j                         | _        || _        y r   )	r   r   ri   r   lin1lin2ReLUr  r  )rS   r  r  r   s      rH   r   zDoubleLinear.__init__  sG    IIc3'	IIc3'	GGI	!2rG   r   rP   c                     | j                   r@| j                  | j                  |            | j                  | j                  |            fS | j                  | j                  |            S r   )r  r  r  r  r-  s     rH   r   zDoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rG   T)rA   rB   rC   rd   r   r  r   rf   rg   r   re   r   r  r  s   @rH   r  r    sT    
3C 3D 3''	uU\\5<</0%,,>	?'rG   r  new_all_gather_into_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rt   all_gather_into_tensorbarrier)r  orig_all_gathers     rH   patch_all_gatherr    sO     11OLLN"<D6&5# 	&5#   0A;A  A;!A88A;new_reduce_scatter_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rt   rZ  r  )r  rW  s     rH   patch_reduce_scatterr    sP     44LLN!:D9%8" 	%8"r  new_all_reducec              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rt   r  r  )r  orig_all_reduces     rH   patch_all_reducer    sJ     ooOLLN$DO*) 	)r  new_unshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   unshardrt   r  )r  orig_unshards     rH   patch_unshardr    Q      "))LLLN(N.!- 	!-r  new_reshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   reshardrt   r  )r  r  s     rH   patch_reshardr  
  r  r  new_post_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   post_backwardrt   r  )r  orig_post_backwards     rH   patch_post_backwardr    sR      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   r  rt   r  )r  orig_backwards     rH   *patch_register_post_backward_hook_backwardr  $  sT      199MLLN,8 )>0=$- 	0=$-r  rW  r_   r`   c                     t        |      dkD  r|d   }nd|v r|d   }nt        d| d|        ||        ||i |S )Nr   rY   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsrW  rm   r_   r`   rY   s         rH   reduce_scatter_with_assertr  1  sa     4y1}a	V	!;D6F8T
 	
 f///rG   replicated_modulesharded_moduleprefixes_to_ignore.c                    t        |j                         |j                               D ]  \  \  }}\  }}|}|D ]  }	|j                  |	d      } | j                  ||       | j	                  |t
               t        |t
              sJ |j                  |j                  }}
t        |      t        d      t        d      fk(  rt        d      t        ||
|      }| j                  |j                         |j                                |j                  | j                  |j                         | j!                  |j                         t        |j                  |
|      }| j	                  |j                  t
               t        |j                  t
              sJ | j                  |j                  j                         |j                                 y )N r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)rx   rq   replaceassertEqualassertIsInstancer#   r  r  
placementsre   r$   r  r"   to_localr  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r  sharded_ref_paramsharded_ref_grads                 rH   check_sharded_parityr  D  s    OR**,n.M.M.OO TJ+*-JlM *( 	HF!3!;!;FB!G	H);<]G4-111(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BD*U]//9-,,g666**3357G7P7P7RS1TrG   c                   @     e Zd Zed        Z fdZd Zd Zd Z xZ	S )FSDPTestMultiThreadc                     t         S r   DEVICE_COUNTr   s    rH   r~   zFSDPTestMultiThread.world_sizef      rG   c                 B    t         |           | j                          y r   )r   setUp_spawn_threadsrS   r   s    rH   r  zFSDPTestMultiThread.setUpj  s    rG   c                      t        | g|i |S r   r.   ri  s      rH   r.   z FSDPTestMultiThread.run_subtestsn      D242622rG   c                 @    t         j                  j                          y r   rf   _dynamoresetr   s    rH   perThreadSetUpz"FSDPTestMultiThread.perThreadSetUpq      rG   c                 @    t         j                  j                          y r   r  r   s    rH   perThreadTearDownz%FSDPTestMultiThread.perThreadTearDownt  r  rG   )
rA   rB   rC   propertyr~   r  r.   r  r	  r  r  s   @rH   r  r  e  s)     3rG   r  c            $           e Zd Z fdZed        Zed        Zedefd       Zed        Z	d Z
d Zd	 Zd
 Zed        Z	 	 	 	 	 	 	 d'dej"                  dedededee   dedee   dededeeeef      fdZddd e       dddddddddfdee   dededee   deded ed!ee    d"ee!   dee   d#ed$ededed%eeeef      deeeef      f d&Z" xZ#S )(FSDPTestc                 h    t         |           dt        j                  d<   | j	                          y )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    rH   r  zFSDPTest.setUpy  s)     14

,-rG   c                     t         S r   r  r   s    rH   r~   zFSDPTest.world_size  r  rG   c                 >    t         j                  j                         S r   )rt   distributed_c10d_get_default_groupr   s    rH   rl   zFSDPTest.process_group  s    $$7799rG   rP   c                      yr4  rF   r   s    rH   destroy_pg_upon_exitzFSDPTest.destroy_pg_upon_exit  s     rG   c                 *    t          | j                   S r   )r0   	file_namer   s    rH   init_methodzFSDPTest.init_method  s    t~~.//rG   c                 <    | j                  ||j                         y r   )r  r   )rS   r  r   s      rH   _check_cpu_offloadzFSDPTest._check_cpu_offload  s    j&<&<=rG   c                 <    | j                  ||j                         y r   )r  backward_prefetch)rS   r  r  s      rH   _check_backward_prefetchz!FSDPTest._check_backward_prefetch  s    *J,H,HIrG   c                 <    | j                  ||j                         y r   )r  forward_prefetch)rS   r  r"  s      rH   _check_forward_prefetchz FSDPTest._check_forward_prefetch  s    ):+F+FGrG   c                      t        | g|i |S r   r   ri  s      rH   r.   zFSDPTest.run_subtests  r  rG   c                      | |      }||_         ||_        |j                  dd      }t        d|j                    d|j                          	 |r`t
        j                  j                  j                  j                  j                         }t        j                  d|j                  ||       nDt        j                  |j                  t        t        |j                        |j                          d }
|j                   t,        z  }t.        st0        rt
        j2                  j5                  |       |g}
t        j6                  |
       t
        j8                  j;                          |j=                  ||       t
        j8                  j;                          t        j6                  |
       t        j>                          y # t         $ r=}	d|	j"                  d	   v r&t%        j&                  t(        d
   j*                          d }	~	ww xY w)Nfake_pgFzdist init r=z, world=fake)backendr~   r   store)r  r(  r~   r   	recompiler   backend_unavailable)
device_ids) r   r  getprintr~   rf   testing	_internalrY  r&  	FakeStorert   init_process_groupr  DISTRIBUTED_BACKENDr   RuntimeErrorr_   sysexitr/   	exit_coder  r2   r4   acceleratorset_device_indexr  r  r  run_testdestroy_process_group)r  r   	test_namer  piper`   rS   r&  r)  er,  	device_ids               rH   _runzFSDPTest._run  s   9~	"**Y.TYYKx/@AB	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	..y9[

 	
+i&
+""$/  	affQi'$9:DDE		s   B&G 	H8HHNFrk   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |xr |j                   }t        |j                               j                  }|
i }
t	        dd|i|
}t
        j                  j                  |j                         |d      }t        |      D ]  }|j                          t
        j                  j                  t        |      5  |j                  j                  t        j                  t                    }|	s|rMt        |t               s=t        |t
        j"                        r|j%                         }nt'        d |D              } || }|rft        |t               rV|j(                  t*        vrD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j                  j/                  ||      j1                  |      }d d d        |j3                        }|s&|	s$|j4                  t
        j6                  k(  sJ d       |	r+| j-                  |j4                  t
        j8                         net        |t               r+|J | j-                  |j4                  |j:                         n*| j-                  |j4                  t
        j6                         |j                  j=                  |       |rTt        |t               rD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j?                  |       |jA                          |s|jC                         jE                         D ci c]  \  }}||jG                          }}}tI        |       |jK                  |        t        |t               r|jM                  tN        jP                         jS                         S # 1 sw Y   xY wc c}}w )	Nenabledg?)rC  momentum)rK  c              3   <   K   | ]  }|j                           y wr   )r   )rr  r   s     rH   rs  z4FSDPTest._train_for_several_steps.<locals>.<genexpr>  s     %>1affh%>rt  r<   zeloss data type should be float32, as the original                     parameter data type is float32.rF   )*offload_paramsnextr   rT   r   rf   optimSGDrv   	zero_gradamprB  r   r'  rU   r  r   rg   r   re   r  r   r  rZ   r   scaler   float32float16param_dtyper^   stepupdater   r   cloner   load_state_dict_assert_stater   IDLErr   )rS   rk   rA  rB  rC  rD  rE  rF  rG  rH  rI  cpu_offload_paramsmodel_devicesharded_grad_scalerrP  r   rX   rY   rB  r]   kvr   s                          rH   _train_for_several_stepsz!FSDPTest._train_for_several_steps  s4    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy! 9	2AOO##K#B M..u||K/HI _Zt=T!%6 %

 %%>%> > '"5$/ //>? #--/ H((5<<3FGH ||,,UF;>>|L-M. ',,T2D"=JJ%--/555/ !$$TZZ?t,*666$$TZZ1L1LM$$TZZ?LL%%d+!j&=))+ DA$$QXXu||E/BCD  $$U+&&(7<7G7G7I7O7O7QRtq!alR
R E"%%j1s9	2v eT" 2 23{{}wM Mf Ss   9DOO(O%	r   Tmodel_classr  r   ref_init_fn	num_itersr   r  r  r"  use_orig_paramsinit_kwargsc                    |t         j                  k7  sJ d       |i }d}| j                  j                         } |j                  | j                  t         j                  t
        j                  fddi|}|.t        rt        |t        gt              }nt        ||g|      }n ||      }|r|j                         }| j                  |||
du|||
|||	      }t        |j                               }|j                  |||	|
||d       	  |j                  | j                  |||fddi|}t%        |t&              st'        || j                  fi |}|r|j                         }|t
        j(                  k(  r|j+                  t              }|duxr |j,                  }|xr |t
        j(                  k(  }|xr |t
        j(                  k7  }|rFt/        j0                  d      }|j                         D ]  }| j3                  |j0                  |         |r| j5                  t6        dt               n	t9               }|5  | j                  ||d||||
|||
      } ddd       |ry|r[t/        j0                  d      }|j                         D ]  }| j3                  |j0                  |          j+                  t              } t;        |      }!t.        j<                  j?                  | d       |
|s| j3                  ||!dd       yyy# t        $ r }t!        d	| d
t#        |             |d}~ww xY w# 1 sw Y   xY w)a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)r,  output_device)rB  rC  rD  rF  rG  rH  rI  )r   r  r  rF  r"  rg  zInitializing z raised error r<   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)rB  rC  rD  rE  rF  rG  rH  rI  )check_dtypezFSDP did not match DDP)exact_devicemsg) r@   rD   rl   r   rb   rJ   rK   r3   DDPr   r   rc  r   r   rY  	Exceptionr  r   r  r   rL   r   rN  rf   rT   r  assertRaisesRegexr4  r   r   r/  assert_close)"rS   rd  r  r   re  rf  rE  r   r  r  rF  r"  rg  rG  rH  rh  rI  r	  rC  r   rk   	ref_modelref_loss
ddp_paramsr  r>  rN  expects_device_errorexpects_cpu_device
cpu_devicerz   context	fsdp_lossfsdp_unsharded_paramss"                                     rH   _test_fsdp_parityzFSDPTest._test_fsdp_parity)  s   F l222	<;	<2K!!&&(     ((
 	

 
 {m;	  4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y)))"" 	
 # J *d+ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0 ;  z:; $ ""%%0M3  	  	55!,% /+E++E 6 I	   e,J#..0 ;  z:;![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF	 	s$   "L L?	L<L77L<?M)rj  NFNFFN)$rA   rB   rC   r  r
  r~   rl   r  r  r  r  r   r#  r.   classmethodr@  ri   rj   r   floatr   r   r   r   r   r   rc  r`  rO   r@   rJ   r   r   r   r|  r  r  s   @rH   r  r  x  sC       : : d   0 0>JH3 1% 1%p 15 48+0#?CUyyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<Ux +/",,8<8<48!& %+0#04?C#g-(g %g )	g
 h'g g g  g $$45g $$45g ".1g g g %)g g  d38n-!g" %-T#s(^$<#grG   r  compile_compute_on_modulec                 @      fd G d dt               fd}|S )Nc                      t        j                  j                  j                  | i | t	        | d         r| d   j                          y y )Nr   )rf   rY  r  r   r  rE  )r_   r`   r  s     rH   !fully_shard_with_compiled_computez=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sN    **D;F;$,
G.1
 GOO1
rG   c                   (    e Zd Z e       Z e       Zy)*compiled_fsdp_test.<locals>.FullyShardModeN)rA   rB   rC   r   EAGERCOMPILED_COMPUTErF   rG   rH   FullyShardModer    s    6rG   r  c                 4     t                fd       }|S )Nc                     t         j                  j                  j                  }D ]  }|j                  k7  r t               st        j                  d       3t         j                  j                  j                  }t         j                  j                  j                  }t         j                  j                          |j                  k(  r|}n^|j                  k(  rAdt         j                  j                  _
        dt         j                  j                  _        }nt        d|       |	j                   |j"                  <    	| i | t         j                  j                          |	j                   |j"                  <   |t         j                  j                  _
        |t         j                  j                  _         y )Nz0Inductor on GPU needs Triton and recent GPU archTr>   z!Need to implement FullyShardMode=)rf   rY  r  r   r  r5   warningswarnr  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rA   )
r_   r`   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr  r  funcs
          rH   wrapperz6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  sd   (-(9(9(>(>(J(J & R>///
MM"TU+0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&69RrG   r	   )r  r  r  r  s   ` rH   	decoratorz%compiled_fsdp_test.<locals>.decorator  s#    	t	R 
	R@ rG   )r   )r  r  r  r  s   ` @@rH   compiled_fsdp_testr    s"    " ""H rG   c                   &     e Zd Zd fdZd Z xZS )
SkipModulec                 \    t         |           t        j                  ddd      | _        y N
   Fr  )r   r   ri   r   linr  s    rH   r   zSkipModule.__init__  s"    99R%0rG   c                 $    | j                  |      S r   )r  r-  s     rH   r   zSkipModule.forward  s    xx{rG   rc   rj  r  s   @rH   r  r    s    1rG   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                     t         |           |r:t        t        j                  ddd      j                  t                    | _        y t        j                  ddd      j                  t              | _        y r  )r   r   r!   ri   r   r   r   nested_linear)rS   	fsdp_wrapr   s     rH   r   zNestedLinear.__init__  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrG   c                 $    | j                  |      S r   )r  r-  s     rH   r   zNestedLinear.forward  s    !!!$$rG   rj  r  s   @rH   r  r    s    O%rG   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                    t         |           t        j                  ddd      j	                  t
              | _        t               j	                  t
              | _        t        t        |      t
              | _        y )Nr  Fr  )r  )r?  )r   r   ri   r   r   r   linearr  linear_skipr!   r  r  )rS   double_nestr   s     rH   r   zSkipModel.__init__  sW    iiBU366{C%<??;7!;/;
rG   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r-  s     rH   r   zSkipModel.forward$  s4    KKNQq!rG   rj  r  s   @rH   r  r    s    
rG   r  )FT)FFr  )rF   r   )
contextlibr  r=  r5  rR  r  abcr   r   r   copyr   enumr   r   	functoolsr
   typingr   r   r   r   r   r   unittestr   rf   torch.distributedrY  rt   torch.nnri   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   ro  *torch.testing._internal.common_distributedr,   r-   r.   r/   $torch.testing._internal.common_utilsr0   r1   r2   r3   r4   torch.utils._tritonr5   r  r   r3  r7   device_countr:   r@   rJ   rj   rO   r  r   r   r  r   r   r   r   r   r   r   r   r   r  r6  r9  rI  rb  rf  rl  r  r&  r  r  contextmanagerr  r  r  r  r  r  r  r  re   r   r  r  r  r`  r  r  r  r  rF   rG   rH   <module>r     sa    	 	 
   # "    F F        4 4 
 ? S 
 I R R F F  F H   + K ::**,LK K 99))+LK L4 T BIIs 499$$ >% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	TyyT IIT c3h	TB/ &X# Xv
0(4. 0f 	%299 	%		 rG   