
    Vh                        U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZm Z m!Z!m"Z" d d	l#m$Z$ d d
l%m&Z& d dl'Z'd dl(Z'd dl)Z'd dl*m+Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZC d dlDZD ej                  ej                          ej                  eH      ZI G d de      ZJi d eJdd      d eJdd      d eJdd      d eJdd      d eJdd       d! eJd"d#      d$ eJd%d&      d' eJd(d)      d* eJd+d,      d- eJd.d/      d0 eJd1d2      d3 eJd4d5      d6 eJd7d8      d9 eJd:d;      d< eJd=d>      d? eJd@dA      dB eJdCdD      dE eJdFdG      iZKe G dH dI             ZLdJ ZMdK ZNdL ZOdM ZPdN ZQdO ZRdP ZSdQ ZTdR ZUdS ZVdT ZWdU ZXdV ZYdW ZZdX Z[dY Z\dZ Z]d[ Z^d\ Z_d]e'j                  d^ead_ead`ebfdaZce7dbdcdd edef      dddgddfdh       Zde;rdiZen ea ej                  djdk            ZedldmiZge:rdnegdo<   ddpZhd`eafdqZiedr        ZjddseadteadueafdvZkdteadwelfdxZmdanee	j                     epdy<   ddzeel   d`dfd{Zqdd|Zrd}Zs G d~ de<      Zt G d det      Zudevelewe!   f   de"de!fdZxdayd`ebfdZzd Z{deeesfdZ| G d de<      Z} G d de2j                        Z G d de2j                        Zedd       Z G d de'j                  j                  jx                        Z G d det      Z G d de<      Zy)    N)contextmanager)	dataclass)	timedelta)Enum)partialreducewraps)StringIO)
NamedTupleOptionalUnionAnyCallable)patch)	trace_log)
DeviceType)_SymmetricMemory)FILE_SCHEMAfind_free_portIS_SANDCASTLEretry_on_connect_failuresskip_but_pass_in_sandcastleskip_but_pass_in_sandcastle_ifTEST_WITH_ROCMTEST_WITH_TSANTestCase	run_testsTEST_HPUTEST_XPU)_install_threaded_pg_uninstall_threaded_pgProcessLocalGroup)levelc                   "    e Zd ZU eed<   eed<   y)TestSkip	exit_codemessageN)__name__
__module____qualname__int__annotations__str     Z/home/dcms/DCMS/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.pyr%   r%   <   s    NLr/   r%   backend_unavailableH   z5Skipped because distributed backend is not available.small_worldsizeI   z Skipped due to small world size.odd_worldsizeW   zSkipped due to odd world size.no_cudaJ   zCUDA is not available.zmulti-gpu-1K   zNeed at least 1 CUDA devicezmulti-gpu-2M   zNeed at least 2 CUDA deviceszmulti-gpu-3P   zNeed at least 3 CUDA deviceszmulti-gpu-4Q   zNeed at least 4 CUDA deviceszmulti-gpu-5R   zNeed at least 5 CUDA deviceszmulti-gpu-6S   zNeed at least 6 CUDA deviceszmulti-gpu-7T   zNeed at least 7 CUDA deviceszmulti-gpu-8U   zNeed at least 8 CUDA devicesncclL   z#c10d not compiled with NCCL support
skipIfRocmN   zTest skipped for ROCmno_peer_accessO   z'Test skipped because no GPU peer accessgenericV   zHTest skipped at subprocess level, look at subprocess log for skip reasonimporterrorX   z"Test skipped due to missing importno_acceleratorY   zaccelerator is not available.c                       e Zd Zi Zh ded<    e       ed<   ddhed<   ddhed<   i Zh ded	<   h ded
<   h ded<   h ded<    e       ed<   erdhed<   erdhed<   yy)DistTestCases>   mpiuccrA   allgather_coalescedr   rA   rP   zsendrecv anysourcezcpu barrier>   rP   gloorA   gpucudaddpsubgrouppluginhcclhpuxcclxpuN)r(   r)   r*   skip_collectivesetbackend_featurer   r   r.   r/   r0   rN   rN   [   s     O-CO)* #OH-3UOO()&,e_OM" O4OE5OF4OE"9OJ #OH"("( r/   rN   c                 .     t                fd       }|S )zSkips if the world size exceeds the number of GPUs, ensuring that if the
    test is run, each rank has its own GPU via ``torch.cuda.device(rank)``.c                     t         j                  j                         s&t        j                  t
        d   j                         t        t        j                  d         }t         j                  j                         |k  r)t        j                  t
        d|    j                         t        rFt         j                  j                  |k  r)t        j                  t
        d|    j                         t        rFt         j                  j                  |k  r)t        j                  t
        d|    j                          | i |S )Nr7   
WORLD_SIZE
multi-gpu-z
multi-xpu-)torchrT   is_availablesysexit
TEST_SKIPSr&   r+   osenvirondevice_countr   rY   r   r[   )argskwargs
world_sizefuncs      r0   wrapperzskip_if_no_gpu.<locals>.wrapperu   s    zz&&(HHZ	*445L12
::""$z1HHZ*ZL 9:DDE		..;HHZ*ZL 9:DDE		..;HHZ*ZL 9:DDET$V$$r/   r	   rn   ro   s   ` r0   skip_if_no_gpurr   q   s"     4[% % Nr/   c                 .     t                fd       }|S )Nc                      t         j                  d   dk7  rEt        t         j                  d         dk  r&t        j                  t
        d   j                          | i |S )NBACKENDrO   ra      r3   rh   ri   r+   re   rf   rg   r&   rk   rl   rn   s     r0   ro   z(skip_if_small_worldsize.<locals>.wrapper   sR    JJy!U*BJJ|4L0MPQ0QHHZ 12<<=T$V$$r/   rp   rq   s   ` r0   skip_if_small_worldsizery           
4[% % Nr/   c                 .     t                fd       }|S )Nc                      t         j                  d   dk7  rHt        t         j                  d         dz  dk(  r&t        j                  t
        d   j                          | i |S )Nru   rO   ra         r5   rw   rx   s     r0   ro   z&skip_if_odd_worldsize.<locals>.wrapper   sW    JJy!U*BJJ|4L0MPQ0QUV0VHHZ0::;T$V$$r/   rp   rq   s   ` r0   skip_if_odd_worldsizer      rz   r/   c                       fd}|S )Nc                 4     t                fd       }|S )Nc                      dk(  rKt         j                  j                         k  r*t        j                  t
        d    j                         y  | i |S NrA   rb   )rc   rT   rj   re   rf   rg   r&   )rk   rl   backendrn   ns     r0   ro   zCrequire_n_gpus_for_nccl_backend.<locals>.decorator.<locals>.wrapper   sM    & UZZ%<%<%>%Bj$45??@T,V,,r/   rp   )rn   ro   r   r   s   ` r0   	decoratorz2require_n_gpus_for_nccl_backend.<locals>.decorator   s     	t	- 
	- r/   r.   )r   r   r   s   `` r0   require_n_gpus_for_nccl_backendr      s     r/   c                      d } | S )Nc                 .     t                fd       }|S )Nc                      	 ddl m}m}  | i |S # t        $ r) t	        j
                  t        d   j                         Y y w xY w)Nr   )AutoModelForMaskedLM
BertConfigrI   )transformersr   r   ImportErrorre   rf   rg   r&   )rk   rl   r   r   rn   s       r0   ro   z?import_transformers_or_skip.<locals>.decorator.<locals>.wrapper   sE    >
 T,V,, >M2<<=>s    /AArp   rq   s   ` r0   r   z.import_transformers_or_skip.<locals>.decorator   s     	t		> 
		> r/   r.   )r   s    r0   import_transformers_or_skipr      s     r/   c                     t         j                  j                         xr! t         j                  j                         | k\  S N)rc   rT   rd   rj   )xs    r0   at_least_x_gpur      s,    ::""$G)@)@)Ba)GGr/   c                       fd}|S )Nc                 2     t                fd       }|S )Nc                     t         j                  j                         r)t         j                  j                         k\  r | i |S t        r)t         j
                  j                         k\  r | i |S t        r)t         j                  j                         k\  r | i |S t        j                  t        d    j                         y )Nrb   )rc   rT   rd   rj   r   rY   r   r[   re   rf   rg   r&   )rk   rl   rn   r   s     r0   ro   z4skip_if_lt_x_gpu.<locals>.decorator.<locals>.wrapper   s    zz&&(UZZ-D-D-F!-KT,V,,EII2249T,V,,EII2249T,V,,HHZ*QC 01;;<r/   rp   )rn   ro   r   s   ` r0   r   z#skip_if_lt_x_gpu.<locals>.decorator   s     	t	= 
	= r/   r.   )r   r   s   ` r0   skip_if_lt_x_gpur      s     r/   c                       fd}|S )Nc                 4     t                fd       }|S )Nc                      dk7  r | i |S t         j                  j                         r)t         j                  j                         k\  r | i |S t	        j
                  t        d    j                         y r   )rc   rT   rd   rj   re   rf   rg   r&   )rk   rl   r   rn   r   s     r0   ro   z9nccl_skip_if_lt_x_gpu.<locals>.decorator.<locals>.wrapper   sm    & T,V,,zz&&(UZZ-D-D-F!-KT,V,,HHZ*QC 01;;<r/   rp   )rn   ro   r   r   s   ` r0   r   z(nccl_skip_if_lt_x_gpu.<locals>.decorator   s     	t	= 
	= r/   r.   )r   r   r   s   `` r0   nccl_skip_if_lt_x_gpur      s    	 r/   c                     | j                         }d|v sJ d|v sJ d|v sJ |d   }|j                  d      dk(  r|n|j                  d      d   }||v sJ d| d|        y )	N	iteration	has_errorerrorz
Exception raised from r   zDid not find expected z in ddp logging data error: )_get_ddp_logging_datafindsplit)	model_DDP
err_substrddp_logging_datalogging_erractuals        r0   verify_ddp_error_loggedr      s     668********&&&&"7+K ??56"< 	89!<  	+R	x'CK=QRr/   c                 .     t                fd       }|S )aJ  
    Convenience decorator to set/unset TORCH_NCCL_BLOCKING_WAIT flag. Note that use of
    this decorator will override the setting of TORCH_NCCL_ASYNC_ERROR_HANDLING for
    the particular test. After the test, both TORCH_NCCL_BLOCKING_WAIT and
    TORCH_NCCL_ASYNC_ERROR_HANDLING will be restored to their original values.
    c                     	 t         j                  d   }t         j                  d= 	 t         j                  d   }dt         j                  d<   	  | i |}|||t         j                  d<   ||t         j                  d<   S S # t        $ r d }Y jw xY w# t        $ r d }Y gw xY w# dt         j                  d<   w xY w# ||t         j                  d<   ||t         j                  d<   w w xY w)NTORCH_NCCL_ASYNC_ERROR_HANDLINGTORCH_NCCL_BLOCKING_WAIT1)rh   ri   KeyError)rk   rl    cached_nccl_async_error_handlingcached_nccl_blocking_waitretrn   s        r0   ro   z(with_nccl_blocking_wait.<locals>.wrapper  s   	4AC1B, 

<=	9:<***;% 69BJJ12	S''C 0; 5 

5 )49R

56 51  	4/3,	4  	-(,%	- 69BJJ12 0; 5 

5 )49R

56 5s@   $B B 	B> BBB# B& "B##B& &B;>-C+rp   rq   s   ` r0   with_nccl_blocking_waitr      s%     4[ S  SD Nr/   c                       fd}|S )zK
    Runs a test for each distributed debug level specified in levels.
    c                 2     t                fd       }|S )Nc                     t         j                  j                  dd       }D ][  }|t         j                  d<   t        j                           | i |}t        j
                          |I|t         j                  d<   ] S )NTORCH_DISTRIBUTED_DEBUG)rh   ri   getc10dset_debug_level_from_envbarrier)rk   rl   	old_levelr#   r   rn   levelss        r0   ro   z:with_dist_debug_levels.<locals>.decorator.<locals>.wrapper.  sx    

'@$GI F8=

45--/D+F+(<EBJJ89F Jr/   rp   )rn   ro   r   s   ` r0   r   z)with_dist_debug_levels.<locals>.decorator-  s     	t	 
	 r/   r.   )r   r   s   ` r0   with_dist_debug_levelsr   (  s    
$ r/   c                  @    t        t        j                          d      S )Nz+c10d was not compiled with the Gloo backend)r   r   is_gloo_availabler.   r/   r0   requires_gloor   B  !    )""$$5 r/   c           	         t        j                         st        d      S t        t        j
                  j                  j                         | k  d|  dt        j
                  j                  j                          d|       S )N+c10d was not compiled with the NCCL backendz0Requires NCCL version greater than or equal to: z	, found: z
, reason: )r   is_nccl_availabler   r   rc   rT   rA   version)r   msgs     r0   requires_nccl_versionr   I  sv    !!#*9
 	
 .JJOO##%/>wiyQVQ[Q[Q`Q`QhQhQjPkkuvyuz{
 	
r/   c                  @    t        t        j                          d      S )Nr   )r   r   r   r.   r/   r0   requires_ncclr   U  r   r/   c                  @    t        t        j                          d      S )Nz*c10d was not compiled with the UCC backend)r   r   is_ucc_availabler.   r/   r0   requires_uccr   [  !    )!!##4 r/   c                  @    t        t        j                          d      S )Nz*c10d was not compiled with the MPI backend)r   r   is_mpi_availabler.   r/   r0   requires_mpir   a  r   r/   c                      t         j                  j                         xr$ t        j                  t
        j                  d      } t        |  d      S )Nr   z"multicast support is not available)rc   rT   rd   r   has_multicast_supportr   CUDAr   )r   s    r0   requires_multicast_supportr   h  sI    

! 	G22:??AF  *!!, r/   c                 <     d _         t                fd       }|S )zSkips a test for ROCmTc                  n    t         s | i |S t        j                  t        d   j                         y )NrC   )r   re   rf   rg   r&   rx   s     r0   ro   z*skip_if_rocm_multiprocess.<locals>.wrapperw  s-    (((L)334r/   )skip_if_rocm_multiprocessr	   rq   s   ` r0   r   r   s  s(    %)D"
4[5 5
 Nr/   c                  <    t        t        j                  dk(  d      S )Nwin32z8This unit test case is not supported on Windows platform)r   re   platformr.   r/   r0   skip_if_win32r     s    )B r/   devicemajorminorreturnc                     | j                   dk7  rt        d      t        j                  j                  yt        j
                  j                  |       ||fk\  S )z
    Returns True if the device's compute capability is (major, minor) or higher.
    Error out if the device is not a CUDA device.
    Returns False if device is a RoCM device.
    rT   z3sm_is_or_later() is only supported for CUDA devicesF)type
ValueErrorrc   r   hiprT   get_device_capability)r   r   r   s      r0   sm_is_or_higher_thanr     sM     {{fNOO}}$::++F3u~EEr/   	localhostr~   T   )minutesFc                     t               }|rEt        |t        d      z        }t        j                  j
                  j                  | ||||      S t        j                  | |||||      S )zL
    Creates a TCP store. Retries if the chosen port is already in use.
    r~   )milliseconds)wait_for_workers	use_libuv)r   r+   r   rc   classes	dist_c10dTCPStorer   )	addrrm   	is_mastertimeoutr   	jit_classr   porttimeout_milliseconds	            r0   create_tcp_storer     so     D!'I1,E"EF}}&&//$
I/B
 	
 }}$
I@P\e
 	
r/   i  !DISTRIBUTED_TESTS_DEFAULT_TIMEOUT300test_ddp_uneven_inputsi     test_join_kwargsc                     t         j                  dk(  s|  t        j                  j	                  d      S t        j                  j	                  |       S )Nr   z	127.0.0.1)hostname	interface)re   r   r   ProcessGroupGloocreate_devicer   s    r0   r  r    sG    
||w)"3$$22K2HH$$22Y2GGr/   c                 Z    t         j                  | j                  d      d   t              S N.r   )TIMEOUT_OVERRIDEr   r   TIMEOUT_DEFAULT)test_ids    r0   get_timeoutr	    s#    c 22 6HHr/   c               #   N  K   t               t               }} t        j                  t        j                  }}	 | |ct        _        t        _        t        j                  t        j                  f ||ct        _        t        _        y # ||ct        _        t        _        w xY wwr   )r
   re   stdoutstderr)new_outnew_errold_outold_errs       r0   captured_outputr    sl     z8:WGzz3::WG2!('
CJjj#**$$!('
CJ'
CJs   5B%9B	 1B%	B""B%rankrm   
num_inputsc                    ddt         dt         dt         dt         fd}dt         fd}t        |d      t        |d	      t        |d
      t        |d      t        |d	      t        |d
      fD cg c]N  }t        |      D cg c]  } ||| z  |z   ||z         c}t        |      D cg c]  } ||||z         c}fP c}}S c c}w c c}w c c}}w )z
    Generate a number of basic test cases for sparse reduction.
    These cover tensors with a varying number of sparse dimensions and a varying
    number of dense dimensions. The only reduction operation we support is sum.
    r~   r  rm   sparse_dims
dense_dimsc           	         t        j                  t        j                  | dz         d| dz   f      }|gt        |      D cg c]  }d c}z   }t        |dz
        D ]A  }t        j                  |t        j
                  d| dz         f      }|j                  |       C t        j                  | dz   gt        |      D cg c]  }d c}z         }t        j                  |||      S c c}w c c}w )Nr~   r}   )	rc   reshapearangerangecatzerosappendonessparse_coo_tensor)r  rm   r  r  indices_shapevaluess           r0   generatez,simple_sparse_reduce_tests.<locals>.generate  s     --TAX 6D1HF5+<=a=={Q' 	%Aii%++a*B CDGLL$	% TAXJU:5F)G!)GGH&&w>>  > *Hs   	C+	C0
c           
      |    t        t        j                  t        |      D cg c]  } | ||       c}      S c c}w r   )r   operatoraddr  )fnrm   r  s      r0   compute_sumz/simple_sparse_reduce_tests.<locals>.compute_sum  s2    LLE*<MND2dJ/N
 	
Ns   9
)r  r}      )r  )r~   r   )r+   r   r  )r  rm   r  r$  r)  r(  is          r0   simple_sparse_reduce_testsr,    s    
?s 
? 
?# 
?s 
?
C 
 H!,H!,H!,H+H+H+
 	 z* :$q(*z*AB @EZ?PQ![Z*45Q	
  Rs$   5CC C/CC
Cr   c           
      f   t         j                  j                         }t        rt         j                  j                         }t
        rt         j                  j                         }t        |      }d}| |kD  r|| z  }t        |       D ci c]  }|t        |||z  |dz   |z          }}|S c c}w )zMultigpu tests are designed to simulate the multi nodes with multi
    GPUs on each node. Nccl backend requires equal #GPUs in each process.
    On a single node, all visible GPUs are evenly
    divided to subsets, each process only uses a subset.
    r~   )	rc   rT   rj   r   rY   r   r[   r  list)rm   r   nGPUsvisible_devicesnGPUs_per_processr+  rank_to_GPUs          r0   init_multigpu_helperr3    s     JJ##%E		&&(		&&(ElO E!Z/ z" 	
4$5 5QBS8STUUK  	s   B.tmp_dirinit_methodc                    t        j                         at        j                  t        j
                  d<   t	        j                  t        j                  j                  t        j                  d             t	        j                  t        j                  j                  t        j                  d             t        j                  j                  t        j                  d      }t	        j                  |       | | t        j
                  d<   y t        t        j                  j                  |d      z   t        j
                  d<   y )NTEMP_DIRr   test_dirinit_dirINIT_METHODshared_init_file)
tempfileTemporaryDirectoryr4  namerh   ri   mkdirpathjoinr   )r5  init_dir_paths     r0   initialize_temp_directoriesrC    s    ))+G$\\BJJzHHRWW\\',,	23HHRWW\\',,
34GGLLz:MHH]$/

=!$/"'',,-3
 %


=!r/   c                  :    t         t         j                          y y r   )r4  cleanupr.   r/   r0   cleanup_temp_dirrF  ,  s     r/      c            	       <    e Zd ZdZdZdefdZedefd       Zede	fd       Z
d Zdded	edd
f fdZd fdZd fdZdefdZddZddZ G d de      Zede	fd       Zede	dededd
fd       Zdedd
fdZddZddZddZddZedefd       Z xZS ) MultiProcessTestCaser   
   r   c                      y)NFr.   selfs    r0   _should_stop_test_suitez,MultiProcessTestCase._should_stop_test_suiteI  s    r/   c                      y)NTr.   rL  s    r0   destroy_pg_upon_exitz)MultiProcessTestCase.destroy_pg_upon_exitQ  s    r/   c                     t         S r   DEFAULT_WORLD_SIZErL  s    r0   rm   zMultiProcessTestCase.world_sizeU      !!r/   c                 V    t              fd       }t        j                  ||       S )Nc                 j    | j                   | j                  k(  r| j                         y          y r   )r  MAIN_PROCESS_RANK_join_processesrM  r(  s    r0   ro   z1MultiProcessTestCase.join_or_run.<locals>.wrapperZ  s(    yyD222$$R(r/   r	   types
MethodTyperM  r(  ro   s    ` r0   join_or_runz MultiProcessTestCase.join_or_runY  .    	r	 
	 ..r/   method_name
methodNameNc                     |dk7  r|}t         |   |       	 t        | |      }t        | || j	                  |             y # t
        $ r+}|dk7  rt        d| j                   d|       |Y d }~y d }~ww xY wNrunTestzno such test method in z: super__init__getattrsetattrr^  AttributeErrorr   	__class__rM  r`  ra  r(  erk  s        r0   rg  zMultiProcessTestCase.__init__g       "$K%	b{+BD+t'7'7';< 	bY& !#:4>>:J"ZL!YZ`aa '	b   (A 	A6!A11A6c                     t         |           g | _        g | _        | j                  | _        t        j                  d      j                  | _	        i | _
        y )NF)delete)rf  setUpskip_return_code_checks	processesrW  r  r<  NamedTemporaryFiler>  	file_namepid_to_piperM  rk  s    r0   rr  zMultiProcessTestCase.setUpv  sH    ')$**	!44EBGGr/   c                 r    t         |           | j                  D ]  }|j                           g | _        y r   )rf  tearDownrt  	terminate)rM  prk  s     r0   rz  zMultiProcessTestCase.tearDown  s3     	AKKM	 r/   c                 F    | j                         j                  d      d   S r  idr   rL  s    r0   _current_test_namez'MultiProcessTestCase._current_test_name  s    wwys#B''r/   c                    g | _         t        t        | j                              D ]  }t        j
                  j                         \  }} || j                  j                  dt        |      z   || j                         | j                  |fdt        | dd      i      }|j                          t        j                  d||j                          || j"                  |j                   <   | j                   j%                  |        y )Nzprocess fake_pgF)targetr>  rk   rl   zStarted process %s with pid %s)rt  r  r+   rm   rc   multiprocessingPiperk  _runr-   r  rv  rh  startloggerinfopidrw  r  )rM  procr  parent_conn
child_connprocesss         r0   _start_processesz%MultiProcessTestCase._start_processes  s    #doo./ 	+D&+&;&;&@&@&B#K~~**#d)+D335t~~zRwtY>	G MMOKK8$L,7DW[[)NN!!'*	+r/   c                 x    t         j                  j                  d      j                  }| j	                  |       y )Nspawn)rc   r  get_contextProcessr  )rM  r  s     r0   _spawn_processesz%MultiProcessTestCase._spawn_processes  s,    $$009AAd#r/   c                       e Zd ZdZy)MultiProcessTestCase.Eventr~   N)r(   r)   r*   GET_TRACEBACKr.   r/   r0   Eventr    s    r/   r  r  c                    t         j                  d|       	 t        j                  j	                  | |g      }| |v r| j
                  rt         j                  d|       y | j                         }t         j                  d||       |t        j                  j                  k(  rt        j                  d      5 }t        j                  |       |j                          |j                  d       | j!                  |j#                                t         j                  d|       d d d        ||v ry # 1 sw Y   xY w)Nz*Starting event listener thread for rank %sz:Pipe closed for process %s, stopping event listener threadzReceived event %s on process %szr+)moder   zProcess %s sent traceback)r  r  r  
connectionwaitclosedrecvrI  r  r  r<  ru  faulthandlerdump_tracebackflushseeksendread)parent_pipesignal_piper  ready_pipeseventtmp_files         r0   _event_listenerz$MultiProcessTestCase._event_listener  s   @$G)4499;:TUKk)%%KKTVZ #((*=udK066DDD!44$? G8$33H= ( a(#((9$?FG k)5  G Gs   :A,D55D>	test_namerv  c                 T     | |      }||_         ||_        |j                  ||       y r   )r  rv  run_testclsr  r  rv  r  rl   rM  s          r0   r  zMultiProcessTestCase._run  s'    9~	"i-r/   c           	          t         j                  j                  d      \  }}t        j                  t
        j                  ||| j                  fd      }|j                          t        j                  dk7  r2t        j                  dk7  rt         j                  j                  d       dt        j                  d<   	  t        | |              ||j;                  d        |J |j=                          |j?                          | j@                  r	 tC        jD                          y y # t         j"                  $ r[}t$        j'                  d	| j                  |t)        |             t        j*                  t,        d
   j.                         Y d }~d }~wt0        $ r t$        j3                  dt5        j6                         | j                  t
        j8                         |j;                  t5        j6                                t        j*                  t
        j8                         Y Zw xY w# ||j;                  d        |J |j=                          |j?                          w xY w# tF        tH        f$ r Y y w xY w)NF)duplexT)r  rk   daemonr   darwinr   TORCH_SHOW_CPP_STACKTRACESz4Process %s skipping test %s for following reason: %srG   z;Caught exception: 
%s exiting process %s with exit code: %s)%rc   r  r  	threadingThreadrI  r  r  r  re   r   _C'_set_print_stack_traces_on_fatal_signalrh   ri   rh  unittestSkipTestr  r  r-   rf   rg   r&   	Exceptionr   	traceback
format_excTEST_ERROR_EXIT_CODEr  rA  closerP  r   destroy_process_groupAssertionErrorr   )rM  r  r  signal_recv_pipesignal_send_pipeevent_listener_threadses          r0   r  zMultiProcessTestCase.run_test  s   -2-B-B-G-Gu-G-U** ) 0 0'77/;!

 	##%<<7"s||x'? HH<<TB36

/0	 $GD)$&   + %%d+(444!&&($$ **,	 %/    	6KKF		S\^abd^e HHZ	*4455 	@LL0$$&		3G3\3\ Y1134HH)>>?	@  + %%d+(444!&&( #J/ sJ    D+ I+ +H,>AFH/ BH,(H/ +H,,H/ /9I(+I=<I=c                    g }t        | j                        D ]h  \  }}|j                  | j                  |j                     }	 |j                  t        j                  j                         |j                  ||f       j |D ]x  \  }}	 |j                  d      rK|j                  rt        j                  d|       ;|j!                         }t        j                  d||       nt        j                  d|       z y # t        $ r"}t        j                  d||       Y d }~d }~ww xY w# t        $ r!}t        j                  d||       Y d }~d }~ww xY w)NzBEncountered error while trying to get traceback for process %s: %sr   z5Pipe closed for process %s, cannot retrieve tracebackz)Process %s timed out with traceback: 

%sz6Could not retrieve traceback for timed out process: %s)	enumeratert  exitcoderw  r  r  rI  r  r  r  ConnectionErrorr  r   pollr  r  r  )rM  pipesr+  r  piperm  r  r  s           r0   _get_timedout_process_tracebackz4MultiProcessTestCase._get_timedout_process_traceback  s>   #DNN3 		JAw'''4II288FFGLL!T+		   	JD$99Q<{{SUY ! $		ILLEtY LLPRV	 ' LL\^_ab 0 # XZ^`a s6   <D3D/ >D/	D,
D''D,/	E8EEc                    t        | j                               }t        j                         }d}	 	 t        | j                        D ]w  \  }}|j
                  t        j                  k(  s$t        d| d|j
                   d       t        j                  j                         }|D ]  }|j                           d} n |rnt        d | j                  D              rntt        j                         |z
  }	|	|kD  rA| j                          t        d| d       | j                  D ]  }|j                           nt        j                  d	       #t        j                         |z
  }
|| j                   v r| j#                  |
       n| j%                  |
       | j&                  j)                         D ]  }|j+                           y # | j&                  j)                         D ]  }|j+                           w xY w)
NFTProcess z terminated with exit code z", terminating remaining processes.c              3   8   K   | ]  }|j                   d u  y wr   )r  ).0r|  s     r0   	<genexpr>z7MultiProcessTestCase._join_processes.<locals>.<genexpr>9  s     F!qzz-Fs   zTiming out after z" seconds and killing subprocesses.g?)r	  r  timer  rt  r  rI  r  printrc   r  active_childrenr{  allr  sleeprs  _check_no_test_errors_check_return_codesrw  r#  r  )rM  r(  r   
start_timesubprocess_errorr+  r|  r  acelapsedelapsed_timer  s               r0   rX  z$MultiProcessTestCase._join_processes#  s   dggi(YY[
 *	'7 FQ zz%9%N%NN&qc)DQZZLPrs +0*?*?*O*O*Q"1 +BLLN++/( $Ft~~FF))+
2W$88:+G94VW "^^ &&

3= @  99;3LT111**<8((6 ((//1 

((//1 

s   9G +D2G 1G>c                     t        | j                        D ]I  \  }}|j                  t        d| d| d      | j	                  | j
                  |j                         K y)zV
        Checks that we didn't have any errors thrown in the child processes.
        Nr  z timed out after  seconds)r  rt  r  RuntimeErrorassertNotEqualr  )rM  r  r+  r|  s       r0   r  z*MultiProcessTestCase._check_no_test_errorsS  sh     dnn- 	GDAqzz!"qc!2<.I   9 91::F	Gr/   c                 8   | j                   st        j                  d       y| j                   d   }t        | j                         D cg c]&  \  }}|j                  t
        j                  k(  r||f( }}}|r[d}|D ]I  \  }}| j                  |j                     j                         }|d| dt
        j                   d| dz  }K t        |      t        | j                         D ]h  \  }}|j                  t        d| d	| d
      | j                  |j                  |j                  d| d|j                   d|j                          j t        j                         D ]q  }	|j                  |	j                  k(  st        r1t        j!                  d| j#                         |	j$                          yt'        j(                  |	j$                         | j                  |j                  dd|j                   d|j                          yc c}}w )z
        Checks that the return codes of all spawned processes match, and skips
        tests if they returned a return code indicating a skipping condition.
        z<Note: no subprocesses were spawned, test was likely skipped.Nr    r  z exited with error code z and exception:

 terminated or timed out after r  zExpect process z+ exit code to match Process 0 exit code of z
, but got )r   6Skipping %s on sandcastle for the following reason: %sz Expected zero exit code but got z
 for pid: )rt  r  warningr  r  rI  r  rw  r  r  r  assertEqualrg   r#  r&   r   r  r  r'   r  r  )
rM  r  first_processr+  r|  errored_processesr   r  error_messageskips
             r0   r  z(MultiProcessTestCase._check_return_codes^  s?    ~~NNYZq) "$..1
1zz1FFF F
 

 E/ 
7 $ 0 0 = B B Dqc!9:N:c:c9d e''4oR9 u%% dnn- 		DAqzz!"qc!@hW  

&&%aS(STaTjTjSkkuvw  wA  wA  vB  C  		 %%' 	:D%%7 
 KKPRVRYRYR[]a]i]i "++DLL99	: 	""2=3I3I2J*UbUfUfTgh 	 	
S
s   
+Hc                      | j                   dk(  S )Nr   r  rL  s    r0   r   zMultiProcessTestCase.is_master  s    yyA~r/   rd  rd  r   N) r(   r)   r*   rW  r  boolrN  propertyrP  r+   rm   r^  r-   rg  rr  rz  r  r  r  r   r  staticmethodr  classmethodr  r  r  rX  r  r  r   __classcell__rk  s   @r0   rI  rI  @  s6   
   d   "C " "/bC b bUY b(C (+"$    < . . . .W[ . .1# 1t 1f#J.`	G>
@ 4  r/   rI  c                   <     e Zd Z fdZd ZdefdZd Zd Z xZ	S )DistributedTestBasec                 B    t         |           | j                          y r   rf  rr  r  rx  s    r0   rr  zDistributedTestBase.setUp      r/   c                 b    	 t        j                  | j                         y # t        $ r Y y w xY wr   )rh   removerv  OSErrorrL  s    r0   rz  zDistributedTestBase.tearDown  s)    	IIdnn% 		s   " 	..r   c                 "    d|v ryd|v ryd|v ryy)NrT   rA   rY   rX   r[   rZ   rR   r.   )rM  r   s     r0   r   zDistributedTestBase.backend  s$    Vf_f_r/   c                    t        j                  |      j                         }t         j                  j	                  | j
                  |      }t         j                  j                  | j                  |      | j                  | j                  |       d| j                  |      v sd| j                  |      v r)t         j                  j                  | j                         t         j                  j                  j                         S )Nr   rm   r  storerA   rZ   )rc   get_device_modulerj   distributed	FileStorerv  init_process_groupr   rm   r  acceleratorset_device_indexdistributed_c10d_get_default_group)rM  r   num_visible_devicesr  s       r0   	create_pgzDistributedTestBase.create_pg  s    #55f=JJL!!++DNN<OP,,LL(	 	- 	
 T\\&))Vt||F7K-K..tyy9  11DDFFr/   c                     t        j                  |      j                         }t        | j                        D ci c]	  }|||z  g c}S c c}w r   )rc   r  rj   r  rm   )rM  r   r  r+  s       r0   rank_to_devicez"DistributedTestBase.rank_to_device  sG    #55f=JJL6;DOO6LMA++,,MMMs   A)
r(   r)   r*   rr  rz  r-   r   r  r  r  r  s   @r0   r  r    s%      GNr/   r  subtest_configtest_fntest_kwargsc                    t        |j                               }|D cg c]  }|d   	 }}|D cg c]  }|d   	 }}t        j                  | D ]  }	t	        t        ||	            }
 | j                  di |
5  t        j                  j                           ||i ||
 t        j                  j                          ddd       t        j                           yc c}w c c}w # 1 sw Y   *xY w)a\  
    Runs a test function given by ``test_fn`` as a subtest according to the
    configurations specified by ``subtest_config``. This amortizes the
    costly setup overhead (including process spawn and initializing the
    process group) over the subtests.

    Args:
        subtest_config (Dict[str, List[Any]]): A mapping from subtest
            keyword argument name to a list of its possible values.
        test_fn (Callable): A callable that runs the actual test.
        test_args: Positional arguments to pass to ``test_fn``.
        test_kwargs: Keyword arguments to pass to ``test_fn``.
    r   r~   Nr.   )r.  items	itertoolsproductdictzipsubTestrc   _dynamoresetr   r   )cls_instr  r  	test_argsr  subtest_config_itemsitemsubtest_config_keyssubtest_config_valuesr#  subtest_kwargss              r0   run_subtestsr#    s    * 9=^=Q=Q=S8T:N%O$d1g%O%OBV-W$d1g-W-W##%:; c"5v>?X// 	"MM!Y@+@@MM!	" 	 &P-W	" 	"s   C C%:AC**C3	c                      t         t         S 	 t        j                  g dd      j                  dk(  a t         S # t        $ r
 da Y t         S w xY w)a   
    If shell command `fi_info -p efa -t FI_EP_RDM` returns exit code 0 then we assume that the machine has
    Libfabric EFA interfaces and EFA software components installed,
    see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html.
    )fi_infoz-pefaz-t	FI_EP_RDMF)checkr   )EFA_PROBE_RESULT
subprocessrun
returncodeFileNotFoundErrorr.   r/   r0   has_efar.    sZ     #!NNFeT__cdd 	
   ! !s   &: AAc                  "    t               rddgS dS )a  
    If the machine has Libfabric EFA interfaces and EFA software components installed it may cause
    'RuntimeError: In operator() at tensorpipe/common/ibv.h:172 "": Operation not supported' if tensorpipe
    uses InfiniBand transport, so we exclude it from tensorpipe transports,
    see https://github.com/pytorch/pytorch/issues/73885 and https://github.com/pytorch/pytorch/issues/65022
    shmuvN)r.  r.   r/   r0   tp_transportsr2    s     $IE4=/4/r/   c                 d      t        t        |      S d t                fd       }|S )z+
    Wrapper to use with a test method
    )r   rm   c                      t               t        j                         }fd fd}g }t               D ]=  }t	        j
                  |||f      }|j                          |j                  |       ? |S )Nc                  >     t         j                  j                  k(  S r   r   r
  _worldworlds   r0   world_is_validzaspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads.<locals>.world_is_valid!      D118888r/   c                 ~   t        j                  d| |       	                  rt        j                          y y # t        $ rR}t        j                  j                  | t        j                         f       t        j                  |       Y d }~sd }~ww xY w#         rt        j                          w w xY w)Nthreadedr   r  rm   r  )r   r  BaseExceptionMultiThreadedTestCaseexception_queueputre   exc_infor"   exception_handler  )r  world_pgr  excallbackr:  rm   s       r0   workerzYspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads.<locals>.worker$  s    ##"*E1
 "#..0 $ ! 7%55994:PQ!222667
 "#..0 $s*   A   	B	ABB BB B<r  rk   )r    r   	HashStorer  r  r  r  r  )	rm   rG  global_storerH  threadsr  tr9  r:  s	   ``     @@r0   #_run_test_method_with_multi_threadszIspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads  sq    $&~~'	9	1 *% 	D  dE<5PQAGGINN1	
 r/   c                 X    t         j                  j                  j                  d       	   fd      }t        j                  |       t         j                  j                  j                  d       y # t         j                  j                  j                  d       w xY w)NTc                       g i S r   r.   )rk   rn   rl   rM  s   r0   <lambda>z?spawn_threads_and_init_comms.<locals>.wrapper.<locals>.<lambda>@  s    dSWNiZ^NibhNi r/   F)rc   r  _distributed_c10d_set_thread_isolation_moder@  _join_threads)rM  rk   rl   rL  rN  rn   rm   s   ``` r0   ro   z-spawn_threads_and_init_comms.<locals>.wrapper;  sq     	""==dC	I9*FijG!//>HH&&AA%HEHH&&AA%Hs   %A> >+B))r   spawn_threads_and_init_commsr	   )rn   r   rm   ro   rN  s   ` ` @r0   rU  rU    sD     |('j
 	

< 4[I I Nr/   c                       e Zd ZdZ ej
                         ZdZd Zdde	de	ddf fdZ
d	 Zd
 Zd fdZ fdZd Zed        Zd Zed        Zed        Zedefd       Zede	fd       ZddddZddddZ xZS )r@  a5  
    Test runner that runs all tests with the in-proc process group using
    multiple threads with the threaded process group.

    Each test spawns world_size threads and run the test method in each thread.

    Difference from regular MultiProcess test runner:
    Must explicitly defines SetUp and call self._spawn_threads() to run the tests.
    Cannot use setUp / tearDown (must use perThreadSetup / perThreadShutdown)
        to set up / tear down each thread when running each test.
    No global state possible
        How bad of a limitation is this?
    r   c                 V    t              fd       }t        j                  ||       S )Nc                     | j                   | j                  k(  r| j                  | j                         y          y r   )r  MAIN_THREAD_RANKrT  rL  rY  s    r0   ro   z2MultiThreadedTestCase.join_or_run.<locals>.wrapper\  s.    yyD111""4<<4r/   rZ  r]  s    ` r0   r^  z!MultiThreadedTestCase.join_or_run[  r_  r/   r`  ra  r   Nc                     |dk7  r|}t         |   |       	 t        | |      }t        | || j	                  |             y # t
        $ r+}|dk7  rt        d| j                   d|       |Y d }~y d }~ww xY wrc  re  rl  s        r0   rg  zMultiThreadedTestCase.__init__e  rn  ro  c                      y r   r.   rL  s    r0   perThreadSetUpz$MultiThreadedTestCase.perThreadSetUpt  s    r/   c                      y r   r.   rL  s    r0   perThreadTearDownz'MultiThreadedTestCase.perThreadTearDownx  s    r/   c                 x    t         |           | j                  | _        g | _        dt
        j                  d<   y)z
        setUp only set up things in the main thread, if you want to configure things
        in the spawned threads, use perThreadSetUp
        r   r  N)rf  rr  rY  r  rL  rh   ri   rx  s    r0   rr  zMultiThreadedTestCase.setUp{  s1    
 	))	36

/0r/   c                 0    t         |           g | _        y)z
        tearDown only set up things in the main thread, if you want to configure things
        in the spawned threads, use perThreadTearDown
        N)rf  rz  rL  rx  s    r0   rz  zMultiThreadedTestCase.tearDown  s    
 	r/   c                    t         j                  j                  j                  d       | j                  }t               t        j                         | j                  _	        fd} |       st        d      t        | j                        D ]e  }t        j                  | j                  j                  ||| j                  f      }|j!                          | j"                  j%                  |       g y)zk
        class method to spawn threads and run test, use this method in the SetUp of your TestCase
        Tc                  >     t         j                  j                  k(  S r   r6  r8  s   r0   r:  z<MultiThreadedTestCase._spawn_threads.<locals>.world_is_valid  r;  r/   zInvalid worldrI  N)rc   r  rR  rS  r  r    r   rJ  rk  rK  r  r  rm   r  r  r  r  rL  r  )rM  r  r:  r  rM  r9  s        @r0   _spawn_threadsz$MultiThreadedTestCase._spawn_threads  s     	""==dC++	$&&*nn&6#	9 //$//* 	#D  (;(;9dTXTcTcBdeAGGILL"	#r/   c                     | |      }||_         t        |d      rWt        j                         |_        t
        j                  |j                  _        t
        j                  |j                  _	        |j                  |||       y )N_tls)r  hasattrr  localre  r   
_precision	precision_rel_tolrel_tolrun_test_with_threaded_pg)r  r  r  rm   rl   rM  s         r0   r  zMultiThreadedTestCase._run  sb    9~	 4 !)DI"*"5"5DII ( 1 1DII&&y$
Cr/   c                    t        j                  d||| j                  j                         | j	                          	  t        | |              t        j                          | j                          y# t        $ rN}| j                  j                  |t        j                         f       t        j                  |       Y d}~wd}~ww xY w# t        j                          | j                          w xY w)zd
        Run the current test associated with `test_name` using the threaded process group.
        r=  r>  N)r   r  rk  rK  r\  rh  r?  rA  rB  re   rC  r"   rD  r  r^  )rM  r  r  rm   rF  s        r0   rl  z/MultiThreadedTestCase.run_test_with_threaded_pg  s     	TjHcHc	
 		%$GD)$&
 &&(""$  	3  $$dCLLN%;<..r22	3 &&(""$s*   A5 5	C>ACC CC &C5c           
         t         }	 t        |      D ]f  \  }}|j                  t        d|             |j	                         s2t
        j                  j                  |t        t        d| d      d ff       h t        j                          g }| j                  j                         sF| j                  j                         }|j                  |       | j                  j                         sFt                t        j                   j"                  j%                  d       | j'                  |||       y # t                t        j                   j"                  j%                  d       w xY w)Nr   zRank failed to join in under r  F)r  r  rA  maxis_aliver@  rA  rB  TimeoutErrorr"   r  emptyr   r  r!   rc   r  rR  rS  r  )r  rL  r(  r   idxthreadfailed_ranksfailures           r0   rT  z#MultiThreadedTestCase._join_threads  s-   !	I(1 VC7O,??$)99== , ,&CG9H$U!" !%	 ##%L))//1--113##G, ))//1 #$HH&&AA%Hgr: #$HH&&AA%Hs   <D9 B,D9 95E.c           	         d}d}|D ]1  \  }}|d   }t        |t        j                        r;t        j	                  d||t        |             |dk  sMt        d   j                  }at        |t              r)d| d| d	}	t        j                  |	       t        |	      t        |t              rEdj                  t        j                  |       }	t        j                  d
|	|       |d| d|	 dz  }t        |t              st!        |j"                        t$        k(  s|dk  s&|j"                  }4 t'        |      dkD  rt        |      |dkD  rqt        j)                         D ]Y  }
||
j                  k(  st*        r#t        j	                  d||
j,                          y t        j                  |
j,                         y y )Nr  r   r~   z3Thread %s skipping test %s for following reason: %sr   rG   zThread r  z	 seconds
z'Caught exception: 
%s exiting thread %sz exited with exception:
r  r  )
isinstancer  r  r  r  r-   rg   r&   rq  r   r  r  rA  r  format_exception
SystemExitr   coder+   lenr#  r   r'   )r  ru  r   r(  	error_msg	skip_coder  rC  excr   r  s              r0   r  z)MultiThreadedTestCase._check_return_codes  s    		* 	)ND(1+C#x001I4QSUXY\U] q= *9 5 ? ?IC.v%DWIZXS!"3''C+ggi88(CD>T dV#<SED	 C,>S(Y] #I-	)2 y>Ay))q="))+ 	>.$TVXZ^ZfZf &//==	> r/   c                     t         S r   rR  rL  s    r0   rm   z MultiThreadedTestCase.world_size  rT  r/   c                 F    | j                         j                  d      d   S r  r~  rL  s    r0   r  z(MultiThreadedTestCase._current_test_name  s     wwys#B''r/   r   r  c                J    | j                   |k(  r| j                  |||       yy)z
        The reason why we have this util function instead of
        self.assertEqual is all threads are sharing one CPU RNG
        so the assertion result is only reliable on rank 0
        N)r  r  rM  r   yr   r  s        r0   assertEqualOnRankz'MultiThreadedTestCase.assertEqualOnRank  s'     99Q3' r/   c                H    | j                   |k(  r| j                  ||       y y r   )r  r  r  s        r0   assertNotEqualOnRankz*MultiThreadedTestCase.assertNotEqualOnRank'  s#    991% r/   r  r  r   )r(   r)   r*   __doc__queueQueuerA  rY  r^  r-   rg  r\  r^  rr  rz  rc  r  r  rl  rT  r  r  r+   rm   r  r  r  r  r  s   @r0   r@  r@  I  s     "ekkmO/bC b bUY b	7#* D D%& ; ;: .> .>` "C " " (C ( (( (&1 & &r/   r@  c                        e Zd Zdeej
                  ej                  f   deddf fdZ	dej                  dej                  fdZ
 xZS )SaveForwardInputsModuleforward_inputscast_forward_inputsr   Nc                 t    t         |           t        j                  dd      | _        || _        || _        y )Nd   )rf  rg  nnLinearlr  r  rM  r  r  rk  s      r0   rg  z SaveForwardInputsModule.__init__-  s2    
 	3$,#6 r/   r   c                     || j                   | <   | j                  | j                  r3|j                  | j                  j                  j
                              S |      S r   )r  r  r  toweightdtyperM  r   s     r0   forwardzSaveForwardInputsModule.forward7  sI    $%D!vv43K3Kadd466==../SSQRSSr/   r(   r)   r*   r  r  Modulerc   Tensorr  rg  r  r  r  s   @r0   r  r  ,  sT    7RYY457 "7 
	7T T%,, Tr/   r  c                        e Zd Zdeej
                  ej                  f   deddf fdZ	dej                  dej                  fdZ
 xZS )SaveForwardInputsModelr  r  r   Nc                 t    t         |           t        ||      | _        t        ||      | _        || _        y r   )rf  rg  r  c1c2r  r  s      r0   rg  zSaveForwardInputsModel.__init__=  s6    
 	).:MN).:MN,r/   r   c                 `    || j                   | <   | j                  | j                  |            S r   )r  r  r  r  s     r0   r  zSaveForwardInputsModel.forwardG  s)    $%D!wwtwwqz""r/   r  r  s   @r0   r  r  <  sQ    -RYY45- "- 
	-# #%,, #r/   r  c              #     K   |st         j                  j                  |        dt        j                  d<   dt        j                  d<   |rp|rVt         j
                  j                  j                  j                  j                         }t        j                  d|| |       nt        j                  d| |       t         j                  j                          t         j                  j                  j                  j!                          	 d  t         j                  j                          t         j                  j                  j                  j!                          |rt        j"                          y y # t         j                  j                          t         j                  j                  j                  j!                          |rt        j"                          w w xY ww)	Nr   MASTER_ADDR6789MASTER_PORTfaker  rA   r  rm   )rc   r  r	  rh   ri   testing	_internalr  r  	FakeStorer   r  r  r  utilscountersclearr  )r  rm   init_pgr  r  s        r0   _dynamo_dist_per_rank_initr  K  sJ     **40 +BJJ} &BJJ}MM++77??IIKE##%	 ##F*M	MM	MM  &&()$$**,&&(  	$$**,&&( s    D
G%E9 A(G%9A)G""G%c                   @     e Zd ZdZe fd       Ze fd       Z xZS )#DynamoDistributedSingleProcTestCasez
    Test harness for single-process dynamo distributed tests,
    initializes dist process group.

    Prefer this for simple tests, as it's easier to debug.
    c                 `   t         |           | j                  j                  t	        j
                  t        j                  ddd             d| _        d| j                   | _	        d| j                  v rd n| j                  g| _
        t        j                  d| j                  d	       y )
Nr   12355)r  r  r   zcuda:rT   rA   r~   r  )rf  
setUpClass_exit_stackenter_contextr   r  rh   ri   r  r   
device_idsr   r  r  rk  s    r0   r  z.DynamoDistributedSingleProcTestCase.setUpClassq  s    %%JJ

#.#*	
 SXXJ'
!'3::!5CHH:SXX!Dr/   c                 J    t        j                          t        |           y r   )r   r  rf  tearDownClassr  s    r0   r  z1DynamoDistributedSingleProcTestCase.tearDownClass  s    ""$r/   )r(   r)   r*   r  r  r  r  r  r  s   @r0   r  r  i  s2     E E"    r/   r  c            	       d     e Zd ZdZ fdZ fdZedefd       Ze	dede
de
dd	fd
       Z xZS )"DynamoDistributedMultiProcTestCasea   
    Use this for tests that actually run on multiple GPUs.

    Decorate tests with @skip_if_lt_x_gpu(ngpu)

    Note: MultiProcTestCase spawns processes per test and is slow.
    Prefer MultiThreadedTestCase for most tests. Perhaps use this one
    sparingly for integration tests.
    c                 B    t         |           | j                          y r   r  rx  s    r0   rr  z(DynamoDistributedMultiProcTestCase.setUp  r  r/   c                     t         |           	 t        j                  | j                         y # t
        $ r Y y w xY wr   )rf  rz  rh   r  rv  r  rx  s    r0   rz  z+DynamoDistributedMultiProcTestCase.tearDown  s5    	IIdnn% 		s   1 	==r   c                 >    t         j                  j                         S r   )rc   rT   rj   rL  s    r0   rm   z-DynamoDistributedMultiProcTestCase.world_size  s    zz&&((r/   r  r  rv  Nc                     t        j                  t        j                                 | |      }||_        ||_        |j                  ||       y r   )r   
addHandlerloggingNullHandlerr  rv  r  r  s          r0   r  z'DynamoDistributedMultiProcTestCase._run  s@    W0023 9~	"i-r/   )r(   r)   r*   r  rr  rz  r  r+   rm   r  r-   r  r  r  s   @r0   r  r    s^      )C ) ) . . . .W[ . .r/   r  c            	            e Zd ZU dZeed<   dZeed<   dZee	   ed<    e
d      Ze
ed	<   eej                  d
e	fd              Zedd       Ze fd       Ze fd       Ze	 ddededee	   fd       Z xZS )MultiProcContinousTestr}   rm   r   r  N	rdvz_filex   )secondsr   r   c                     t        d      )z
        ProcessGroup backend str.
        To be customized by sub test classes, e.g. "nccl".
        Here we raise error.
        z/Please implement backend_str in your test class)NotImplementedError)r  s    r0   backend_strz"MultiProcContinousTest.backend_str  s     ""STTr/   c                      y)z
        ProcessGroup init options.
        To be customized by sub test classes, e.g. ProcessGroupNCCLOpTest
        Here we return None.
        Nr.   )r  high_priority_streams     r0   optszMultiProcContinousTest.opts  s     r/   c                 f   t         |           d| j                  cxk  r| j                  k  s'n t	        d| j                   d| j                         | j
                  r+t        j                  | j
                  | j                        }nd}| j                         }| j                         }t        d|       t        j                  || j                  | j                  ||| j                         t        j                  j                         | _        t        d| j                   d       y)	z
        Class-scope test fixture. Run once for entire test class, before any test starts.
        Set up the process group.
        r   zBRank must be set and in the range of 0 to world_size. World size: z Rank: NzTesting backend=)r   rm   r  r  
pg_optionsr   Rank z setup complete)rf  r  r  rm   r  r  r   r  r  r  r  r  r   r
  r  pg)r  r  r  r   rk  s       r0   r  z!MultiProcContinousTest.setUpClass  s     	CHH-s~~-"~~.gchhZA  ==NN3==#..AE Exxz//#!
#$~~KK	
 &&99;chhZ/0r/   c                     t        j                          t        |           | j                  r 	 t        j                  | j                         t        d| j                   d       y# t        $ r Y %w xY w)z
        Class-scope test fixture. Run once for entire test class, after all tests finish.
        Tear down the process group.
        r  z teardown completeN)
r   r  rf  r  r  rh   r  r  r  r  r  s    r0   r  z$MultiProcContinousTest.tearDownClass  sd     	""$==		#--( 	chhZ123  s   A* *	A65A6c                 B    || _         || _        || _        t                y)ad  
        This is an entry point for each rank to run the tests in `MultiProcContinousTest`.
        In this entry point, we set the class variables for the test class.
        Then we run all tests.

        Note:
        - This helper only works for a subclass of `MultiProcContinousTest`.

        Example:
        - See `test_c10d_ops_nccl.py`.
        N)r  rm   r  r   )r  r  rm   r  s       r0   run_rankzMultiProcContinousTest.run_rank  s    & #!r/   )Fr   )r(   r)   r*   rm   r+   r,   r  r  r   r-   r   r   r  abcabstractmethodr  r  r  r  r  r  r  s   @r0   r  r    s     JD#N#Ix}#"3/GY/ UC U  U   1 1> 4 4 
 $(	  C=	 r/   r  r   )r~   r  )TF)r  r  r  r  r  rh   r  r*  re   r<  r  r  r  r[  r  
contextlibr   dataclassesr   datetimer   enumr   	functoolsr   r   r	   ior
   typingr   r   r   r   r   unittest.mockr   torch._logging._internalr   rc   torch._dynamo.test_casetorch.cuda.nccltorch.distributedr  r   torch._C._autogradr   torch._C._distributed_c10dr   torch.nnr  $torch.testing._internal.common_utilsr   r   r   r   r   r   r   r   r   r   r   r   5torch.testing._internal.distributed.multi_threaded_pgr    r!   r"   r&  basicConfigINFO	getLoggerr(   r  r%   rg   rN   rr   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r  r   r   r  getenvr  r  r	  r  r,  r-   r3  r4  r=  r,   rC  rF  rS  rI  r  r  r.  r#  r)  r.  r2  rU  r@  r  r  r  r  r  	test_caser  r  r  r.   r/   r0   <module>r     s        	   
       % !   , ,  = =  .      ) 7     
    ',, '			8	$z 
8
C x$FG	
 Xb"BC x45 8B => 8B >? 8B >? 8B >? 8B >? 8B >? 8B >? 8B >? HR>?  (267!" hr#LM#$ x
V%* 8B DE+, hr#BC-
4 * * **6&H$R$+\4	

F Fc F# F$ F  	a 
 
0 O)"))$GOPO,c2  +.'(HIC I 2 2(S (c (s (XS 3 2 26(--	. 5
Xc] 
d 
"  `8 `N%N. %NNd3i( 
 F   &0 
3E5p`&H `&FTbii T #RYY # ) ): %--*A*A*J*J  @!.)= !.HfX fr/   