
    Vh*                     (   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlmZmZmZ d dlmZmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZmZ  ej>                  e       Z!dZ" G d de      Z#dedefdZ$dedefdZ%dede&e#ef   fdZ'y)    N)	b64decode	b64encode)	timedelta)AnycastOptional)	FileStoreStoreTCPStore)construct_and_record_rdzv_event	NodeState   )RendezvousConnectionErrorRendezvousErrorRendezvousParametersRendezvousStateError)RendezvousBackendToken)_matches_machine_hostnameparse_rendezvous_endpointir  c                       e Zd ZU dZdZeed<   eed<   dededdfd	Ze	defd
       Z
deeeef      fdZ	 ddedee   deeeeef      fdZdedefdZdedeeeef      fdZy)C10dRendezvousBackendzRepresents a C10d-backed rendezvous backend.

    Args:
        store:
            The :py:class:`torch.distributed.Store` instance to use to
            communicate with the C10d store.
        run_id:
            The run id of the rendezvous.
    Y2FuaW1hZGFt_store_keystorerun_idreturnNc                     |st        d      || _        d|z   | _        | j                  d| j                  d| j                         y )Nz&The run id must be a non-empty string.ztorch.rendezvous.compare_set )
ValueErrorr   r   _call_store_NULL_SENTINEL)selfr   r   s      l/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py__init__zC10dRendezvousBackend.__init__4   sD    EFF'&0	 			2t7J7JK    c                      y)See base class.c10d )r%   s    r&   namezC10dRendezvousBackend.nameD   s     r(   c                 \    | j                  d| j                        }| j                  |      S )r*   get)r#   r   _decode_state)r%   base64_states     r&   	get_statezC10dRendezvousBackend.get_stateI   s)    "..udii@!!,//r(   statetokenc                 H   t        |      j                         }|r=t        |t              s| j	                         }|	g |d}|S y|j                         }n| j
                  }| j                  d| j                  ||      }| j                  |      }|y|\  }}	||	||k(  fS )r*   NFr    )	r   decode
isinstancebytesr2   r$   r#   r   r0   )
r%   r3   r4   base64_state_strresulttmpr1   state_token_pair	new_state	new_tokens
             r&   	set_statezC10dRendezvousBackend.set_stateO   s     !*% 0 7 7 9eU+)%(6.5.C JLLNE''E"..499e-=
  --l;#/	9
 )Y%%777r(   store_opc                     	  t        | j                  |      |i |S # t        t        t        f$ r}t        d      |d }~ww xY w)NMThe connection to the C10d store has failed. See inner exception for details.)getattrr   r"   RuntimeErrorTimeoutErrorr   )r%   r@   argskwargsexcs        r&   r#   z!C10dRendezvousBackend._call_stores   sN    	174;;14B6BBL,7 	+_	s    A>Ar1   c                     || j                   j                         k(  ry 	 t        |      }||fS # t        j                  $ r}t        d      |d }~ww xY w)Nz=The state object is corrupt. See inner exception for details.)r$   encoder   binasciiErrorr   )r%   r1   r3   rH   s       r&   r0   z#C10dRendezvousBackend._decode_state{   sa    4..5577	l+E l"" ~~ 	&O	s   / AAA)N)__name__
__module____qualname____doc__r$   r
   __annotations__strr'   propertyr-   r   tupler8   r   r2   boolr?   r   r#   r0   r,   r(   r&   r   r   #   s     $NM
ILe LS LT L  c  08E%,$78 0 6:"8"8#+E?"8	%ud*+	,"8HC S #% #HU5%<=P4Q #r(   r   paramsr   c           
      <   t        | j                  t              \  }}| j                  d      }||}nt	        |      }t        t        | j                  dd            }|dk  rt        d      |dfD ]t  }	 t        |||dt        |	      
      }|rSdt        j                          d}t        | j                  |t        j                          t"        j%                  |        |S  S # t        t&        t(        f$ r}	|r|t+        d      |	Y d }	~	d }	~	ww xY w)N)default_portis_hostread_timeout<   r   z,The read timeout must be a positive integer.FT)seconds)	is_mastermulti_tenanttimeoutzProcess z5 hosts the TCP store for the C10d rendezvous backend.)r   message
node_staterB   )r   endpointDEFAULT_PORTget_as_boolr   r   int
get_as_intr"   r   r   osgetpidr   r   r   INITloggerinforD   rE   r   )
rV   hostportcfg_is_hostrY   rZ   	is_serverr   msgrH   s
             r&   _create_tcp_storerq      s2   *6??VJD$$$Y/K  ,D1 V..~rBCLqGHH u% 		#!!,7E  -bc/!==#).. C  L=< L L,7 
	  7/c !8
	s   8A/C--DDDc                     | j                   r| j                   }n	 t        j                         \  }}	 t        |      }|S # t        $ r}t	        d      |d }~ww xY w# t        t        f$ r}t        d      |d }~ww xY w)NzMThe file creation for C10d store has failed. See inner exception for details.rB   )	rb   tempfilemkstempOSErrorr   r	   r"   rD   r   )rV   path_rH   r   s        r&   _create_file_storerx      s    	 &&(GAt$ L  	!_	 % '[
	s.   A  A  	A	AAA=,A88A=c                    | j                  dd      j                         j                         }	 |dk(  rt        |       }n|dk(  rt	        |       }nt        d      t        || j                        }||fS # t        $ rM}t        t        |      j                   dt        |       | j                  t        j                          d}~ww xY w)a	  Create a new :py:class:`C10dRendezvousBackend` from the specified parameters.

    +--------------+-----------------------------------------------------------+
    | Parameter    | Description                                               |
    +==============+===========================================================+
    | store_type   | The type of the C10d store. The currently supported types |
    |              | are "tcp" and "file" which correspond to                  |
    |              | :py:class:`torch.distributed.TCPStore` and                |
    |              | :py:class:`torch.distributed.FileStore`, respectively.    |
    |              | Defaults to "tcp".                                        |
    +--------------+-----------------------------------------------------------+
    | read_timeout | The read timeout, in seconds, for store operations.       |
    |              | Defaults to 60 seconds.                                   |
    |              |                                                           |
    |              | Note this only applies to                                 |
    |              | :py:class:`torch.distributed.TCPStore`. It is not relevant|
    |              | to :py:class:`torch.distributed.FileStore` which does not |
    |              | take in timeout as a parameter.                           |
    +--------------+-----------------------------------------------------------+
    | is_host      | A boolean value indicating whether this backend instance  |
    |              | will host the C10d store. If not specified it will be     |
    |              | inferred heuristically by matching the hostname or the IP |
    |              | address of this machine against the specified rendezvous  |
    |              | endpoint. Defaults to ``None``.                           |
    |              |                                                           |
    |              | Note that this configuration option only applies to       |
    |              | :py:class:`torch.distributed.TCPStore`. In normal         |
    |              | circumstances you can safely skip it; the only time when  |
    |              | it is needed is if its value cannot be correctly          |
    |              | determined (e.g. the rendezvous endpoint has a CNAME as   |
    |              | the hostname or does not match the FQDN of the machine).  |
    +--------------+-----------------------------------------------------------+
    
store_typetcpfilez?Invalid store type given. Currently only supports file and tcp.z: )r`   r   ra   N)r/   striplowerrx   rq   r"   r   r   	Exceptionr   typerM   rR   r   FAILED)rV   rz   r   backendes        r&   create_backendr      s    H L%0668>>@J&v.E5 %f-EQ  (v}}= E>  'Aw''(3q6(3== ''	

 	s   AA7 7	C ACC)(rK   loggingrg   rs   base64r   r   datetimer   typingr   r   r   torch.distributedr	   r
   r    torch.distributed.elastic.eventsr   r   apir   r   r   r   dynamic_rendezvousr   r   utilsr   r   	getLoggerrM   rj   rc   r   rq   rx   rT   r   r,   r(   r&   <module>r      s      	  '  & & 8 8 W  9 G 
		8	$ c#- c#L22 2x 2j3 	 0;/ ;E:OQV:V4W ;r(   