
    Vhm                     n   d dl mZ d dlmZ d dlmZ d dlmZmZ d dl	Z	e	j                  j                  ZdZdZdZd	Zd
Zg dZedefd       ZdededefdZ	 ddedededededee   fdZdedededeegef   dedeee      fdZdededefdZ	 	 	 	 ddedededee   deeegef      deddfdZy)    )Iterable)contextmanager)	timedelta)CallableOptionalNz/num_membersz/last_memberz/TRACEz/TRACING_GATE   )store_timeoutget_allsynchronizebarriertimeoutc              #      K   | j                   }| j                  t        |             d | j                  |       yw)z
    This sets the timeout and then restores the old timeout when the context
    manager exits.

    Args:
        store: the store to set the timeout on
        timeout: the timeout to set
    secondsN)r   set_timeoutr   )storer   old_timeouts      U/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/elastic/utils/store.pyr	   r	      s5      --K	i01		k"s   >A rankprefix
world_sizec                     | j                  t        |      D cg c]  }| | 
 c}      }t        | || d      }|dk(  r| j                  |g       |S c c}w )ad  
    Given a store and a prefix, the method goes through the array of keys
    of the following format: ``{prefix}{idx}``, where idx is in a range
    from 0 to size, and tries to retrieve the data.

    The Rank0 process waits at the end to make sure all other processes
    finished the procedure before exiting.

    Usage

    ::

     values = get_all(store, "torchelastic/data", 3)
     value1 = values[0]  # retrieves the data for key torchelastic/data0
     value2 = values[1]  # retrieves the data for key torchelastic/data1
     value3 = values[2]  # retrieves the data for key torchelastic/data2

    z	/finishedr   r   
key_prefixr   )	multi_getrange_barrier_nonblockingwait)r   r   r   r   idxdata_arrbarrier_keys          r   r
   r
   /   si    & E*<MNS6(3% 0NOH&XY'K
 qy 	

K=!O  Os   Adatar   returnc                     t        | |      5  | j                  | | |       t        | |||      }|cddd       S # 1 sw Y   yxY w)aT  
    Synchronizes ``world_size`` agents between each other using the underlying c10d store.
    The ``data`` will be available on each of the agents.

    Note: The data on the path is not deleted, as a result there can be stale data if
        you use the same key_prefix twice.

    Time complexity: O(N) per worker, O(N^2) globally.
    N)r	   setr
   )r   r"   r   r   r   r   
agent_datas          r   r   r   R   sM    " 
ug	& 		ZL'.UD*jA
  s	   &=Arank_decodertrace_timeoutc                       j                   | t         d        fd} fd}|dk(  r# |       } j                   t         d       |S  |       S )N<val_ignored>c                  Z   t               } d}t        d      D ]c  }|t        k\  r | S 	 |dk(  r(j                   | t         gt                     n'j                   | t         gt        d             e | S # t        $ r |dz  }| j                   |             Y w xY w)Nr      r   )milliseconds)r%   r   _MAX_TRACE_MISSING_RANKSr   _TRACEr   DistStoreErroradd)missing_rank_inforanks_missingir   r'   r   r(   r   s      r   _find_missing_ranksz9_try_detecting_missing_ranks.<locals>._find_missing_rankss   s    Eq*% 	7A  88 ! 
7 A%JJ&<s6(34i6V
 JJ:,qc& :;YTU=VW	7  !  " 7"!%%l1o67s   AB%B*)B*c                  r    	 j                    t         g       d d       dgS # t        $ r Y y w xY w)Nz[<check rank 0 (r   z) for missing rank info>])r   _TRACING_GATEr0   )r   r'   r   s   r   _checkinz._try_detecting_missing_ranks.<locals>._checkin   sJ    	JJ:,}o678&|A&77PQRR 		s   &* 	66r   )r%   r/   r7   )	r   r   r   r   r'   r(   r5   r8   r2   s	   ``` ``   r   _try_detecting_missing_ranksr9   i   sf     
IITF6(+_=! !* qy/1		ZL0/B  z    c                 |    |t         z   }|t        z   }| j                  |d      }||k(  r| j                  |d       |S )zq
    Does all the non-blocking operations for a barrier and returns the final key
    that can be waited on.
    r,   r*   )_NUM_MEMBERS_LAST_MEMBER_CHECKINr1   r%   )r   r   r   num_members_keylast_member_keyr   s         r   r   r      sE    
 !</O #77O
))OQ
'C
j		/?3r:   barrier_timeoutrank_tracing_decoderc                 `   |	|J d       t        | |      5  t        | ||      }	 | j                  |g       	 ddd       y# t        $ rT}||t	        | ||||xs d |      }	|	2t        dj                  |||ddj                  |	       d|            d|d}~ww xY w# 1 sw Y   yxY w)	as  
    A global lock between agents. This will pause all workers until at least
    ``world_size`` workers respond.

    This uses a fast incrementing index to assign waiting ranks and a success
    flag set by the last worker.

    Time complexity: O(1) per worker, O(N) globally.

    Optionally, passing rank will enable tracing of missing ranks on timeouts.
    `rank_tracing_decoder` lambda arg can be used to convert rank data
    into a more meaninful information at an app level (e.g. hostname).

    Note: Since the data is not removed from the store, the barrier can be used
        once per unique ``key_prefix``.
    Nz!Tracing requires rank informationr   c                     t        |       S )N)str)xs    r   <lambda>zbarrier.<locals>.<lambda>   s
    s1v r:   ziTimed out waiting on barrier on rank {}, for key prefix: {} (world_size={}, missing_ranks={}, timeout={})[z, ])r	   r   r   r0   r9   formatjoin)
r   r   r   r@   r   rA   r(   r?   emissing_rankss
             r   r   r      s    4 |#+P-PP+	uo	. .J:
	JJ()   	| <(>-=!! !,(ddjdj &&		- 89;+e	  	  G1	 s)   B$A	B!ABB!!B$$B-),  )rM   NN
   )collections.abcr   
contextlibr   datetimer   typingr   r   torch_C_DistStoreErrorr0   r<   r=   r/   r7   r.   __all__floatr	   intrD   r
   byteslistr   r9   r   r    r:   r   <module>r\      s   % %  %  ))% 	  A #% # #    c  s  R 
  	
   
%[.,, , 	,
 C5#:&, , hsm,^C S S & !;?;; ; 	;
 3-; #8SE3J#78; ; 
;r:   