
    VhE                        d dl Z d dlmZmZ d dlZd dlmZ d dlm	c m
Z d dlm	c mZ d dlmZ d dlmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lm Z  ejB                  jD                  Z"d
gZ#e jH                  d        Z%de&e df   de'de'fdZ(de&e df   dedefdZ)dejT                  jV                  de&e,df   de-e.e,f   defdZ/d Z0dejT                  jV                  de&e,df   de-e.e,f   de,fdZ1dejT                  jV                  de&e,df   de-e.e,f   de,fdZ2dededee   dee   de'de'd ejf                  d!e'ded"e'de&eef   fd#Z4dejT                  jV                  de&e,df   de-e.e,f   de,fd$Z5d%edededee   de'de'd&ed ejf                  d!e'ded"e'defd'Z6dejT                  jV                  de&e,df   de-e.e,f   de,fd(Z7e"j`                  jp                  e1e"jr                  jp                  e2e"jt                  jp                  e5e"jv                  jp                  e5e"jx                  jp                  e7e"jz                  jp                  e7iZ>d) Z?d* Z@y)+    N)castOptional)Tensor)
DeviceMesh)DTensor	ReplicateShard)DTensorSpec
TensorMeta)_MaskPartial)	_skip_dim	Reductionreplicate_reduction_dims)	Placementloss_parallelc               #   <   K   t                d t                yw)a  
    A context manager that enables loss parallelism, where efficient parallelized loss computation
    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
    loss is supported.

    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

    Args:
        input (:class:`DTensor`):
            Input logits. Assumed to be sharded on the class dimension.
        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
            Must be ground truth class indices (class probabilities currently not supported).
            Assumed to be replicated across the ``DeviceMesh``.
        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
            If given, assumed to be replicated across the ``DeviceMesh``.
        label_smoothing:
            Currently not supported.

    Returns:
        A replicated :class:`DTensor`.

    Example:
        A sharded DTensor is manually created here to showcase the usage.
        In practice, it is usually the output of a TP module.

        >>> # xdoctest: +SKIP("distributed")
        >>> from torch.distributed.tensor.parallel import loss_parallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> device_mesh = init_device_mesh("cuda", (8,))
        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
        >>> target = torch.randint(16, (4,), device="cuda")
        >>> with loss_parallel():
        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
        >>>     loss.backward()
        >>> ...
    N)_enable_custom_loss_ops_disable_custom_loss_ops     V/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/tensor/parallel/loss.pyr   r      s     T 	s   
placements.dimreturnc                 |    t        |       dk(  st        d      | d   j                  |      st        d| d      y)N   zLCurrently loss_parallel() only supports input on one-dimensional DeviceMesh.r   zUloss_parallel() should be enabled only when the input tensor is sharded on dimension .)len
ValueErroris_shard)r   r   s     r   _find_all_reduce_mesh_dimr!   P   sS    z?aZ
 	
 a=!!#&cdgchhij
 	
 r   meshc                    t        | t              r-| j                  |k(  r| S t        d| d| j                   d      t        | t        j
                        rt        j                  | ||d      S t        dt        |              )Nz	Expected z	 but got r   F)device_meshr   	run_checkzUnsupported type )	
isinstancer   r   RuntimeErrortorchr   
from_local	TypeErrortype)tensorr   r"   s      r   _cast_to_dtensorr-   \   s     &'"
*M:,i@Q@Q?RRSTUU	FELL	)!!u
 	
 +DL>:;;r   op_callargskwargsc                 (   t         j                  j                  | ||      }t         j                  j                  j	                  |j
                        }t        |t              r|S t        |t              r|d   S t        dt        |       d      )Nr   zUnexpected tensor meta type: r   )r   _op_dispatcherunwrap_to_op_infosharding_propagator_propagate_tensor_metaschemar&   r   tupler'   r+   )r.   r/   r0   op_infotensor_metas        r   r5   r5   l   s    
 $$66wfMG((<<SSK +z*	K	'1~:4;L:MQOPPr   c                    |r| j                   t        j                  k(  sJ t        j                  | t        j
                  j                        \  }}| j                  |t        j                        } | j                         dk(  r| }nYt        j                  | |d      }t        j                  |t        j                  j                  j                   ||f      }| |z
  }t        j"                  t        j$                  |      |d      }	t        j                  |	t        j                  j&                  j                   ||f      }	t        j(                  |	      }
||
z
  }|s|j                  |      }|S )N)type_promotion_kind)dtypememory_formatr   T)keepdim)reduceOpgroup)r<   r(   halfutilselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDDEFAULTtocontiguous_formatnumelamaxfuncol
all_reducec10dReduceOpMAXnamesumexpSUMlog)xr   half_to_floatr"   mesh_dimcomputation_dtyperesult_dtypeshiftedx_maxshifted_sumexpshifted_logsumexpresults               r   _log_softmaxr^      s1   ww%**$$$&+&>&>	uDDLL'#| 	
$E4K4KLAwwyA~

1c40!!DMM--224:J
 e)YYuyy13EN&&!2!2!7!7h?ON 		.1((F<(Mr   c                    t        t        |d         }t        t        |d         }t        t        |d         }|j                  }t        |j                  |      }t        | ||      }t        |j                  |||j                  |      }	t        |j                  |j                  |      }
t        |	|
|	j                        S )Nr   r      r9   requires_grad)r   r   intbool_specr!   r   r5   r^   _local_tensorr"   r
   rc   )r.   r/   r0   rT   r   rU   specrV   output_tensor_metaresres_specs              r   _log_softmax_handlerrl      s    
 	Wd1gA
sDG
CtAw'M77D(#>H/vF
q]DIIx
PC		&H '' r   c                     t        t        |d         }t        t        j                  |d         }|j	                  |      S )Nr      )r   r   r(   r<   rF   )r.   r/   r0   grad_outputinput_dtypes        r   _log_softmax_backward_handlerrq      s7    
 wQ(Ku{{DG,K>>+&&r   rT   targetweightlocal_weight	reductionignore_indexinput_shapechannel_dimrV   c
                 P   | j                         ddk  rddt        dt        ffd}
| |
|      }|J  |
|      }| |z  } t        j                  ||k7  |d      }|j	                        }t        |      }|j                  |||	      }t        j                  | |      }|j                  |||	      }|j                         }t        j                  ||k7  |d      }|t        j                  j                  k(  rdkD  r| j                  dd	      }||fS ||t        | j                        }d
|<   j!                  |      }t        j                  ||      j                        }t        j                  ||k7  |d      }|j#                         }n"||k7  j#                         j%                  |       }|t        j&                  j                  k(  r|j#                         }||fS |t        j(                  j                  k(  r|j#                         |z  }||fS )Nr   r`   r   rs   r   c                 l    dkD  r+dgz  }| j                   d   |<   | j                  |      }|S | }|S )Nr   r   )shapeview)rs   r{   wrx   n_dimss      r   _weight_viewz'_nll_loss_forward.<locals>._weight_view   sQ    A:E "(aE+E"A  Ar   offset_shape
offset_dimr   g        )r   r   r(   where	unsqueezer   _partition_valuegather_reduce_valuesqueezer   NONEvaluenew_fulllistr{   expandrP   rF   rR   MEAN)rT   rr   rs   rt   ru   rv   rw   rx   r"   rV   r   r}   local_wsafe_targetsafe_target_partial_placementsafe_target_partial_result_partialresult_reducedr]   total_weight	new_shapewsumr~   s          `               @r   _nll_loss_forwardr      s    UUWFKz	V 	 	  '''|,K++f4fa@K((5L %++V,==dH \\![2FGN&44^T8TN$$[11F[[</;FINN(((VaZzz"c*|##M	!#	+HHY||A{L9AA+N{{6\14;xxz,.33588; IMM''' < 
inn**	*,<r   c                    t        t        |d         }|d   }|d   }t        t        |d         }t        t        |d         }|j                         dk\  rdnd}|j                  }	t        |	j                  |      }
t        t        |	j                  |g      |      }t               f|	j                  j                  z  }t        |||	j                        }d }|t        |||	j                        }t        |	j                  j                        D cg c]  }||
k(  rt        d      n	t                }}|j                  |	j                  |      j                   }|j"                  d   |j                   j"                  |   k(  sJ |t$        j&                  j(                  k(  r|}n|}t+        |      }||c|d<   |d<   t-        | t/        |      |      }t1        |j                   |j                   ||j                   nd ||||j"                  ||	j                  |

      \  }}t3        |	j                  ||      }t        |||j4                        |fS c c}w )Nr   r   r`   rn      ra   rb   )r   r   rd   r   rf   r!   r   r   r   r   r"   ndimr-   ranger	   redistributerg   r{   r   r   r   r   r5   r7   r   r
   rc   )r.   r/   r0   rT   rr   rs   ru   rv   rx   rh   rV   target_placementsall_replicate_placementsrt   isharded_placementsoutput_placementsri   r]   r   out_specs                        r   _nll_loss_forward_handlerr     s>   
 	Wd1gA!WF!WFS$q'"IT!W%Luuw!|!K77D(+FH " ;-@+ !*~		>f&7CFL!&*BDIIN
 AFdiinn@U
;<XE!H9;6
 
 **4996HIWW!!!$(=(=k(JJJJINN(((-4 :DvDGT!W/tfM,	 & 2			FL 499&7EWXH 	 ..	

 	 =
s   !Iro   r   c                    |j                         dk  rdnd}|t        j                  j                  k(  r| |z  } |j	                  |      }t        j                  ||k7  |d      }t        j                  |      }t        ||      }|j                  |      j                         }|j                  ||	|
      }|j                  j                  J |j                  j                  j                  |j                        dz
  }t        j                   |j"                  d   |j$                        }|j                         dk(  r|||<   n|j                         dk(  r||||f<   ne|j'                  |d      }|j"                  }|j)                  d|j"                  |         }||||f<   |j+                  |      j'                  |d      }|j                         | j                         cxkD  rdkD  rn n| j	                  |      } |t-        |j                               D cg c]  }d }}|j"                  d   ||<   |j)                  |      }t/        |j"                        }d||<   |j1                  |      }t        j2                  |||      }| |z  } t        j                  ||k7  | d      } |t        j4                  |      z   | z  S c c}w )Nr`   r   r   r   g      ?)devicer   )r   r   r   r   r   r(   r   
zeros_liker   r   flattenr   mask_bufferdatarF   r<   aranger{   r   	transposereshaper|   r   r   r   r   rQ   )ro   rT   rr   rs   ru   rv   r   rw   rx   r"   rV   r   
grad_inputr   masked_safe_targetgrad_update	arange_1dgrad_input_tintermidate_shapegrad_input_2d_r   r}   w_targets                           r   "_nll_loss_and_log_softmax_backwardr   X  s    uuw{!KINN(((!L0k*F++f4fa@K!!!$J %++V%%k2::<K*;;KxX((--999#//4477
8H8HICOK  #,>,E,EI
 	uuw!|)4
%&	
A4?
9001!++K<(..$,,R1EF7Bi!334"''(9:DD[RTU
~~+//+/a/!++K8 %aeeg/1Q/	/!'a	+	* M	!#	+MM)$<<;7!H,++f4k1EK 1%44# 0s   "	Kc                    t        t        |d         }t        t        |d         }|d   }|d   }t        t        |d         }t        t        |d         }t        t        |d         }	|j	                         dk\  rdnd}
|j
                  }t        |j                  |
      }t        t        |j                  |
g      |
      }t               f|j                  j                  z  }t        |||j                        }|t        |||j                        }t        |      }||c|d<   |d<   t        |	||j                        |d<   t        | t!        |      |      }t#        |j$                  |j$                  |j$                  ||j$                  nd |||	|j&                  |
|j                  |      }t)        |j                  |j                  |      }t        |||j*                  	      S )
Nr   r   r`   rn   r         ra   rb   )r   r   rd   r   r   rf   r!   r   r   r   r   r"   r   r-   r   r5   r7   r   rg   r{   r
   rc   )r.   r/   r0   ro   rT   rr   rs   ru   rv   r   rx   rh   rV   r   r   ri   r]   r   s                     r   _nll_loss_backward_handlerr     s   
 wQ(KWd1gA!WF!WFS$q'"IT!W%LQ(Luuw!|!K77D(+FH " ;-@+ !*~		>f&7CF!&*BDIIN :DvDGT!W|-EtyyQDG/tfM/!!	 & 2			F 		&H ** r   c                  ^    t         j                  j                  j                  t               y N)r   r2   _custom_op_handlersupdatecustomized_loss_opsr   r   r   r   r     s    ..556IJr   c                  l    t         D ]+  } t        j                  j                  j	                  |        - y r   )r   r   r2   r   pop)	custom_ops    r   r   r     s-    ( B	2266yABr   )A
contextlibtypingr   r   r(   torch._prims_common_prims_commonrB   )torch.distributed._functional_collectivesdistributed_functional_collectivesrJ   "torch.distributed.distributed_c10ddistributed_c10drL   r   torch.distributed.device_meshr   torch.distributed.tensorr   r   r	   &torch.distributed.tensor._dtensor_specr
   r   ,torch.distributed.tensor._ops._embedding_opsr   'torch.distributed.tensor._ops._math_opsr   r   r   (torch.distributed.tensor.placement_typesr   opsaten__all__contextmanagerr   r7   rd   r!   r-   _ops
OpOverloadobjectdictstrr5   r^   rl   rq   Sizer   r   r   r   default_log_softmax_backward_datanll_loss_forwardnll_loss2d_forwardnll_loss_backwardnll_loss2d_backwardr   r   r   r   r   r   <module>r      s    !  # : : 1 1  4 > > J E 
 ? yy~~ 
 - -d	%	3*? 	c 	c 	<in-<5?<< QZZ""Q

Q fQ 	Q&4ZZ""

 f 	>'ZZ""'

' f' 	'F F F  VF  6"	F 
 F  F  F  F  F  F  66>F RAZZ""A

A fA 	AVB5B5B5 B5 V	B5
 B5 B5 B5 B5 B5 B5 B5 B5J8ZZ""8

8 f8 	8x 	3##++-J!!#<##%>""$>$$&@ KBr   