
    BVh9                     \   d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
 dZ	 ddlmZ ddlmZ e
j"                  Z e
j$                  d	dd
        e
j$                  ddd       dZdZdZdZdZdZdZdZdZdZd Zd Zd Zd Z d Z!d Z"d Z#d Z$ G d d       Z%y# e$ r dZY tw xY w)!zCloud TPU Client.    )futuresN)flagsT)	discovery)clientFruntime_oom_exitz,Exit the script when the TPU runtime is OOM.hbm_oom_exitz(Exit the script when the TPU HBM is OOM.KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
TPU_CONFIG,TPU_NAMETPU_API_DISCOVERY_URLGCE_METADATA_IPGCE_METADATA_HOST8470Z   zhttp://{}:8475/requestversionc                  >    t         j                   j                         S )zA wrapper function around datetime.datetime.utcnow.

  This function is created for unit testing purpose. It's not easy to do
  StubOutWithMock with datetime.datetime package.

  Returns:
    datetime.datetime
  )datetimeutcnow     S/home/dcms/DCMS/lib/python3.12/site-packages/tensorflow/python/tpu/client/client.py_utcnowr   5   s     
			!	!	##r   c                  H    t         j                  j                  t              S N)osenvironget#_DISCOVERY_SERVICE_URL_ENV_VARIABLEr   r   r   _environment_discovery_urlr   A   s    	;	<<r   c                      t         j                  j                  t              } | s$t         j                  j                  t        d      } d| z   S )Nzmetadata.google.internalzhttp://)r   r   r   #_GCE_METADATA_ENDPOINT_ENV_VARIABLE_GCE_METADATA_URL_ENV_VARIABLE)endpoints    r   _gce_metadata_endpointr$   E   s=    ZZ^^?@(	zz~~&(BH 
X	r   c                     t         j                  j                  t               d| ddi      }t         j                  j	                  |      }t        |j                               S )Nz/computeMetadata/v1/zMetadata-FlavorGoogle)headers)urllibrequestRequestr$   urlopen_as_textread)pathreqresps      r   _request_compute_metadatar1   N   sU    $:$<dC (+ 	 	-# 
			$$	$))+	r   c              #      K   | j                  d      D ]_  }d}|j                  |      r|j                  |      d   }|j                  d      }|d   }t        }t        |      dkD  r|d   }||d a yw)z'Yields a dict with ip address and port.r   grpc://   :r   	ipAddressportN)split
startswith_DEFAULT_ENDPOINT_PORTlen)	endpointsr#   grpc_prefixparts
ip_addressr8   s         r   %_environment_var_to_network_endpointsrA   V   s     //#& hK;',Q/hNN3EqJ!D
5zA~1Xd s   A5A7c                  x    t         j                  j                  t              } | rt	        j
                  |       S y r   )r   r   r   _DEFAULT_TPUCONFIG_VARIABLEjsonloads)tpu_config_envs    r   _get_tpu_node_configrG   g   s*    ::>>"=>.::n%%	r   c                 |    | r| S t         t        fD ])  }|t        j                  v st        j                  |   c S  y r   )_GKE_ENV_VARIABLE_DEFAULT_ENV_VARIABLEr   r   )tpues     r   _get_tpu_namerM   n   s=    J45 aBJJZZ] 
r   c                 H    t        | t              r| j                  d      S | S )Nzutf-8)
isinstancebytesdecode)ss    r   r,   r,   x   s     588G	
(r   c                       e Zd ZdZ	 	 	 	 	 	 ddZd Zd Zd Zd Zd Z	d	 Z
d
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZddZddZy)ClientzClient for working with the Cloud TPU API.

  This client is intended to be used for resolving tpu name to ip addresses.

  It's recommended to use this library as a contextlib to utilize all
  functionality.
  Nc                    t        |t              r+|st        d      t        |      dk7  rt	        d      |d   }t        |      }|St               }|r<|j                  d      }|xs |j                  d      }|xs |j                  d      }nt        d      t        |      | _	        | j                  j                  d	       | _        || _        d | _        d | _        d | _        d | _        | j                  rz|d
k7  r|| _        |rt        |      | _        nt#        d      | _        |rt        |      | _        n$t#        d      }|j%                  d      d   | _        t'               xs || _        y y )Nz#At least one TPU must be specified.r4   z>Using multiple TPUs in a single session is not yet implementedr   tpu_node_nameprojectzonez(Please provide a TPU Name to connect to.r3   defaultzproject/project-idzinstance/zone/)rO   list
ValueErrorr<   NotImplementedErrorrM   rG   r   r,   _tpur:   _use_api_service_credentials_project_zone_discovery_urlr1   r9   r   )	selfrK   rX   rW   credentialsservicediscovery_urltpu_node_config	zone_paths	            r   __init__zClient.__init__   sa    #t>??	SQ!LN 	NFc

C
{,.o	!!/2;_00;2**62CDDDI		,,Y77DMDMDDMDJD}}			!'	 )12FG	d^
-o>	__S)"-
68IMd r   c                     d|z   S )z&Return the structured Symptom message.z	Symptom: r   )rf   msgs     r   _symptom_msgzClient._symptom_msg   s    r   c                 |   |syt        |      D ]  }|d   dk7  r|d   j                  d      d   }t        j                  j                  |d      }t	               |z
  }|t        j
                  t              k  snt        j                  | j                  d	j                  |j                                      y
 y)z)Check if a runtime OOM event is reported.FsymptomTypeOUT_OF_MEMORY
createTime.r   %Y-%m-%dT%H:%M:%Ssecondsza recent runtime OOM has occurred ~{} seconds ago. The model script will terminate automatically. To prevent future OOM events, please consider reducing the model size. To disable this behavior, set flag --runtime_oom_exit=false when starting the script.Treversedr9   r   strptimer   	timedelta_OOM_EVENT_COOL_TIME_SECloggingwarningro   formatrw   rf   symptomssymptomoom_datetime_stroom_datetime	time_diffs         r   
_oom_eventzClient._oom_event   s    H% 		?	2 .44S9!<&&//0@0CEl)l*i	X''0HI	I !&!2!235	6   r   c                 |   |syt        |      D ]  }|d   dk7  r|d   j                  d      d   }t        j                  j                  |d      }t	               |z
  }|t        j
                  t              k  snt        j                  | j                  d	j                  |j                                      y
 y)z%Check if a HBM OOM event is reported.Frq   HBM_OUT_OF_MEMORYrs   rt   r   ru   rv   za recent HBM OOM has occurred ~{} seconds ago. The model script will terminate automatically. To prevent future HBM OOM events, please consider reducing the model size. To disable this behavior, set flag --hbm_oom_exit=false when starting the script.Trx   r   s         r   _hbm_oom_eventzClient._hbm_oom_event   s    H% 		#6	6 .44S9!<&&//0@0CEl)l*i	X''0HI	I !&!2!235	6   r   c                 H   | j                   r| j                   S t        st        d      | j                  }||dk(  rt        j
                  j                         }| j                  r$t        j                  dd|| j                  d      S t        j                  dd|d      S )a  Creates a new Cloud TPU API object.

    This works around an issue where the underlying HTTP connection sometimes
    times out when the script has been running for too long. Other methods in
    this object call this method to get a new API object whenever they need
    to communicate with the Cloud API.

    Raises:
      RuntimeError: If the dependent Python packages are missing.

    Returns:
      A Google Cloud TPU API object.
    z_Missing runtime dependency on the Google API client. Run `pip install cloud-tpu-client` to fix.rY   rK   v1F)rg   discoveryServiceUrlcache_discovery)rg   r   )
ra   _GOOGLE_API_CLIENT_INSTALLEDRuntimeErrorrb   r   GoogleCredentialsget_application_defaultre   r   build)rf   rg   s     r   _tpu_servicezClient._tpu_service   s     }}]]' F G G ##KkY6,,DDFk__

!"11! ! __
;G Gr   c                 T    d| j                   d| j                  d| j                  S )z)Returns the full Cloud name for this TPU.z	projects/z/locations/z/nodes/)rc   rd   r_   rf   s    r   
_full_namezClient._full_name  s!     	tzz499. .r   c                 0   | j                         }	 |j                         j                         j                         j	                  | j                               }|j                         S # t        $ r }t        d| j                  d|      d}~ww xY w)z:Returns the TPU metadata object from the TPU Get API call.)namez)Could not lookup TPU metadata from name 'zY'. Please doublecheck the tpu argument in the TPUClusterResolver constructor. Exception: N)
r   projects	locationsnodesr   r   execute	Exceptionr]   r_   )rf   rh   rrL   s       r   _fetch_cloud_tpu_metadataz Client._fetch_cloud_tpu_metadata  s    !GF




&
&
(
.
.
0
4
4$//:K
4
LaYY[ F7;yy!E F FFs   AA, ,	B5BBc                 ^    | j                   r!| j                         }|j                  |      S y r   )r`   r   r   )rf   keymetadatas      r   _get_tpu_propertyzClient._get_tpu_property  s)    }}//1h\\#r   c                     d| _         y )NT)_openr   s    r   	__enter__zClient.__enter__$  s	    DJr   c                  
    ~~~y r   r   )rf   typevalue	tracebacks       r   __exit__zClient.__exit__'  s	    eYr   c                     | j                         }| j                         }|r|dv ryt        j                  r| j	                  |      ryt        j
                  r| j                  |      ryy)zReturns true if the TPU is in a state where training should eventually resume.

    If false the TPU is in a unrecoverable state and should be recreated.
    )
TERMINATED	PREEMPTEDFT)stater   FLAGSr   r   r   r   )rf   r   r   s      r   recoverablezClient.recoverable*  s[    
 JJLE}}H55			DOOH$=			 3 3H =r   c                 $    | j                  d      S )z%Return Cloud TPU Symptoms of the TPU.r   r   r   s    r   r   zClient.symptoms9  s    !!*--r   c                 $    | j                  d      S )zReturn state of the TPU.r   r   r   s    r   r   zClient.state=  s    !!'**r   c                 $    | j                  d      S )zReturn health of the TPU.healthr   r   s    r   r   zClient.healthA  s    !!(++r   c                    | j                   st        j                  | j                         d   d         }	 t        j
                  j                  |      }t        j
                  j                  |      }t        j                  |j                               }|j                  d      S | j                  d      S # t        j                  j                  $ r}|j                  }|dk(  rY d}~y|d}~ww xY w)z"Return runtime version of the TPU.r   r7   currentVersion  NtensorflowVersion)r`   _VERSION_SWITCHER_ENDPOINTr   network_endpointsr(   r)   r*   r+   rD   rE   r-   r   error	HTTPErrorcoder   )rf   urlr/   r0   version_detailsrL   status_codes          r   runtime_versionzClient.runtime_versionE  s     ==&--

 
 
"1
%k
24c
nn$$S)~~%%c***TYY[1""#344 !!"566 \\## ff#'s   A1B: :C4C/-C//C4c                 $    | j                  d      S )z#Return accelerator type of the TPU.acceleratorTyper   r   s    r   accelerator_typezClient.accelerator_typeY  s    !!"344r   c                     | j                   S )zPReturn if the Cloud TPU API is available, if not certain features will not work.)r`   r   s    r   api_availablezClient.api_available]  s    ==r   c                     | j                   S )zFReturn the name of the tpu, or the ip address if name is not provided.)r_   r   s    r   r   zClient.namea  s    99r   c                     t        d      S )zNReturn the local ip address of the Google Cloud VM the workload is running on.z instance/network-interfaces/0/ip)r1   r   s    r   get_local_ipzClient.get_local_ipe  s    $%GHHr   c                    | j                   st        t        | j                              S | j	                         }|j                  d      dk7  r+t        d| j                  d|j                  d      d      d|v r|d   S |d   |d   d	gS )
zReturn a list of tpu endpoints.r   READYzTPU "z" is not yet ready; state: ""networkEndpointsr7   r8   r6   )r`   r\   rA   r_   r   r   r   )rf   responses     r   r   zClient.network_endpointsi  s    ==7		BCC--/H||G'))X\\'%:< = =X%())$[18F;KLMMr   c                    t        j                          |z   }| j                         dk7  rt        j                  d| j	                         | j                         | j                                t        j                          |z   |kD  rt        d| j	                         z        t        j                  |       | j                         dk7  rt        j                  d| j	                                y)a?  Wait for TPU to become healthy or raise error if timeout reached.

    Args:
      timeout_s (int): The timeout in seconds for waiting TPU to become healthy.
      interval (int): The interval in seconds to poll the TPU for health.

    Raises:
      RuntimeError: If the TPU doesn't become healthy by the timeout.
    HEALTHYzFWaiting for TPU "%s" with state "%s" and health "%s" to become healthyz0Timed out waiting for TPU "%s" to become healthyzTPU "%s" is healthy.N)timer   r}   r~   r   r   r   sleep)rf   	timeout_sintervaltimeouts       r   wait_for_healthyzClient.wait_for_healthyw  s     iikI%G
++-9
$oo/
))+tzz|T[[]4 
x	'	)>LN 	N
jj ++-9
$ OO*DIIK8r   c                     fd}| j                         }t        j                  t        |            5 }|j	                  ||      }|D ]  }|s|j                           	 ddd       y# 1 sw Y   yxY w)a  Configure TPU software version.

    Args:
      version (string): Version of software to configure the TPU with.
      restart_type (string): Restart behaviour when switching versions,
        defaults to always restart. Options are 'always', 'ifNeeded'.

    c                    | d   }t         dz   j                  |      }t        j                  j	                  |d      }	 t        j                  j                  |       y# t        j                  j                  $ rJ}|j                  }|dk(  rt        dj                              t        dj                  |            d}~ww xY w)	zConfigure individual TPU worker.

      Args:
        worker: A dict with the field ipAddress where the configure request will
          be sent.
      r7   z/{}?restartType={}r   )datar   zTensorflow version {} is not available on Cloud TPU, try a previous nightly version or refer to https://cloud.google.com/tpu/docs/release-notes for the latest official version.zFailed to configure worker {}N)
r   r   r(   r)   r*   r+   r   r   r   r   )workerr@   r   r/   rL   r   restart_typeversions         r   configure_workerz6Client.configure_tpu_version.<locals>.configure_worker  s     +&j'*>>FF
g|-cNN""3S"1cNs#\\## 	Nff#- .4VG_	> > 9@@LM
M	Ns   A# #C
 ACC
)max_workersN)r   r   ThreadPoolExecutorr<   mapresult)rf   r   r   r   workersexecutorresultsr   s    ``     r   configure_tpu_versionzClient.configure_tpu_version  sn    N0 $$&G		#	#G	= -w7g &
--/  s   A/A//A8)NNNrY   NN)i     )always)__name__
__module____qualname____doc__rl   ro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   rT   ~   s     $!0Jd,,"GH.
	F.+,7(5IN9.(r   rT   )&r   
concurrentr   r   rD   r}   r   r   r(   abslr   r   googleapiclientr   oauth2clientr   ImportErrorr   DEFINE_boolrI   rC   _ENDPOINTS_SEPARATORrJ   r   r"   r!   r;   r|   r   r   r   r$   r1   rA   rG   rM   r,   rT   r   r   r   <module>r      s         	   # ''! 	   $d@B   .$<> 6 *  " &= #!2 &9 #  < 	$="x x  '!&'s   B! !B+*B+