
    Vh              	          d dl mZ d dlmZmZ d dlZd dlmZmZm	Z	m
Z
 dgZ	 ddej                  j                  dedee   d	ej                  j                  fd
Zdededej                   d	efdZdededed	ej                   fdZy)    )partial)AnyOptionalN)
DeviceMeshDTensor	ReplicateShardinput_reshardmoduletp_device_meshinput_reshard_dimreturnc                 8   | S ddt         j                  j                  dt        t        df   ddffd}dt         j                  j                  dt        t        df   dt        dt        ffd}| j                  |       | j                  |       | S )	a  
    Register hooks to an nn.Module for input resharding, enabling sharding and restoration during backward computation.

    Register hooks to an nn.Module with input resharding so that we can shard
    per the given `tp_device_mesh` and `input_reshard_dim` and restore the
    input back when recomputing the activations in the backward. The reason
    why we can do this is that for Tensor Parallel(TP), the input are same
    across all TP ranks.

    Args:
        module (:class:`nn.Module`):
            Module to be registered with input resharding.
        tp_device_mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for Tensor Parallel.
        input_reshard_dim (Optional[int]):
            The dimension of where we perform the sharding
            of input. If set None, there is no sharding of input.
            Default: None

    Return:
        A :class:`nn.Module` object registered with TP input resharding.
    N__i.r   c                     t         j                  j                  j                  t	        t
              t	        t                    }|j                          |y N)torchautogradgraphsaved_tensors_hooksr   _pack_hook_tp_unpack_hook_tp	__enter__)r   r   saved_tensor_hookscxr   r   s      _/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/tensor/parallel/input_reshard.pyinput_reshard_forward_pre_hookz5input_reshard.<locals>.input_reshard_forward_pre_hook/   sK    "^^11EEM>3DEO^5FG
 	$$&    _oc                 &    j                          y r   )__exit__)r   r   r    r   s      r   input_reshard_backward_hookz2input_reshard.<locals>.input_reshard_backward_hook8   s     	r   )r   nnModuletupler   register_forward_pre_hookregister_forward_hook)r   r   r   r   r#   r   s    ``  @r   r
   r
      s    8  =AB %((//  uS#X  SW  88?? %c3h58	 $$%CD
  !<=Mr   meshxc                    t        |t              rCt        d |j                  j                  D              r|j                  | t        |      g      S t        |t              s{t        |t        j                        ra|j                         | j                         k\  r@t        j                  ||       j                  | t        |      g      j                         S |S )z.Hook function called after FWD to shard input.c              3   <   K   | ]  }|j                           y wr   )is_replicate).0ps     r   	<genexpr>z _pack_hook_tp.<locals>.<genexpr>E   s     %S1ann&6%Ss   device_mesh
placements)r2   )
isinstancer   all_specr3   redistributer	   r   Tensornumelsize
from_localto_localr)   r   r*   s      r   r   r   C   s    !W#%S@R@R%S"S~~$EBS<T;U~VVq'"q%,,'GGI$ qd3\d>O8P7Q\RXZ	
 r   c                    t        |t              ret        |j                  j                        dk(  rC|j                  j                  d   j                         r|j                  | t               g      S t        |t              st        |t        j                        rk|j                         | j                         k\  rJt        j                  || t        |      g      j                  | t               g      j                         S |S )zKHook function called before activation recomputing in BWD to restore input.   r   r1   )r4   r   lenr6   r3   is_shardr7   r   r   r8   r9   r:   r;   r	   r<   r=   s      r   r   r   U   s     	1g""#q(GGq!**,~~$IK=~IIq'"q%,,'GGI$ t7H1I0J \d	}\EXZ	
 r   r   )	functoolsr   typingr   r   r   torch.distributed.tensorr   r   r   r	   __all__r$   r%   intr
   r8   r   r    r   r   <module>rH      s        J J  (,2HHOO22  }2 XX__	2j
 s u|| PS $*    r   