
"""TPU Strategy."""

import atexit
import collections
import contextlib
import copy
import functools
import weakref

from absl import logging
import numpy as np

from tensorflow.python.autograph.core import ag_ctx as autograph_ctx
from tensorflow.python.autograph.impl import api as autograph
from tensorflow.python.compiler.xla.experimental import xla_sharding
from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import distribute_utils
from tensorflow.python.distribute import input_lib
from tensorflow.python.distribute import input_util
from tensorflow.python.distribute import numpy_dataset
from tensorflow.python.distribute import reduce_util
from tensorflow.python.distribute import tpu_replicated_variable
from tensorflow.python.distribute import tpu_util
from tensorflow.python.distribute import tpu_values
from tensorflow.python.distribute import values
from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as tpu_cluster_resolver_lib
from tensorflow.python.distribute.v1 import input_lib as input_lib_v1
from tensorflow.python.eager import context
from tensorflow.python.eager import def_function
from tensorflow.python.eager import function
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import device as tf_device
from tensorflow.python.framework import device_spec
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variables as variables_lib
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.saved_model import save_context
from tensorflow.python.tpu import device_assignment as device_assignment_lib
from tensorflow.python.tpu import tpu
from tensorflow.python.tpu import tpu_hardware_feature
from tensorflow.python.tpu import training_loop
from tensorflow.python.tpu.ops import tpu_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util import tf_inspect
from tensorflow.python.util.tf_export import tf_export

_XLA_OP_BY_OP_INPUTS_LIMIT = 200
_EXPERIMENTAL_TPU_BATCH_VARIABLE_INITIALIZATION = False


def enable_batch_variable_initialization():
  """Whether to batch variable initialization in tf.function."""
  return (
      _EXPERIMENTAL_TPU_BATCH_VARIABLE_INITIALIZATION
      and context.executing_eagerly()
      and not save_context.in_save_context()
  )


@contextlib.contextmanager
def maybe_init_scope():
  if ops.executing_eagerly_outside_functions():
    yield
  else:
    with ops.init_scope():
      yield


def validate_run_function(fn):
  """Validate the function passed into strategy.run."""

  # In eager mode, `fn` must be a `tf.function`, a `ConcreteFunction`, or a
  # callable object whose `__call__` is a `tf.function`; anything else cannot
  # be lowered for TPU execution.
  if (context.executing_eagerly()
      and not isinstance(fn, def_function.Function)
      and not isinstance(fn, function.ConcreteFunction)
      and not (callable(fn) and
               isinstance(fn.__call__, def_function.Function))):
    raise NotImplementedError(
        "TPUStrategy.run(fn, ...) does not support pure eager execution. "
        "please make sure the function passed into `strategy.run` is a "
        "`tf.function` or `strategy.run` is called inside a `tf.function` if "
        "eager behavior is enabled.")
         d i }i }|r,|j                         D ci c]  \  }} |      s|| }}}|r,|j                         D ci c]  \  }} |      r|| }}}g }d}t        t        j                  |       j                  j                               D ]  \  }	}
|	dk(  r|
j                  dk(  r|
j                  t        j                  j                  k(  r|j                  |
j                         ^|
j                  t        j                  j                  k(  r|	}|
j                  t        j                  j                  k(  s|st        fd|D              r-t        dt        |       dt!        fd|D               d	      | ||fc S  g }d
}t        |      D ]  \  }	} |      r6||	|k\  rt        d      t        |      |	k  rt        d      ||||	   <   d}D|$|	|k\  r|rt        d      |j                  |       jt        |      |	k  rt        d      ||||	   <    |rt#        j$                  | fi |||fS | ||fS c c}}w c c}}w )a|  Inspects arguments to partially apply any DistributedVariable.

  This avoids an automatic cast of the current variable value to tensor.

  Note that a variable may be captured implicitly with Python scope instead of
  passing it to run(), but supporting run() keeps behavior consistent
  with MirroredStrategy.

  Since positional arguments must be applied from left to right, this function
  does some tricky function inspection to move variable positional arguments
  into kwargs. As a result of this, we can't support passing Variables as *args,
  nor as args to functions which combine both explicit positional arguments and
  *args.

  Args:
    fn: The function to run, as passed to run().
    args: Positional arguments to fn, as passed to run().
    kwargs: Keyword arguments to fn, as passed to run().

  Returns:
    A tuple of the function (possibly wrapped), args, kwargs (both
    possibly filtered, with members of args possibly moved to kwargs).
    If no variables are found, this function is a noop.

  Raises:
    ValueError: If the function signature makes unsupported use of *args, or if
      too many arguments are passed.
  c                 n    t        j                  |       }|xr t        |d   t        j                        S Nr   )r.   flattenr@   r   DistributedVariable)xflats     r8   is_distributed_varz:_maybe_partial_apply_variables.<locals>.is_distributed_var   s+    <<?DCJtAw(B(BCCr7   Nr   selfc              3   .   K   | ]  } |        y wr;   r6   .0arO   s     r8   	<genexpr>z1_maybe_partial_apply_variables.<locals>.<genexpr>   s     ?Q-a0?   zWMixing Variables and positional-only parameters not supported by TPUStrategy. Received z& DistributedVariables in **kwargs and c              3   .   K   | ]  } |        y wr;   r6   rR   s     r8   rU   z1_maybe_partial_apply_variables.<locals>.<genexpr>   s     D! 21 5DrV   z" in *args, expected zero for both.FzTPUStrategy.run() cannot handle Variables passed to *args. Either name the function argument, or capture the Variable implicitly.zBToo many positional arguments passed to call to TPUStrategy.run().TzTPUStrategy.run() cannot handle both Variables and a mix of positional args and *args. Either remove the *args, or capture the Variable implicitly.)items	enumerater0   	signature
parametersr   namekind	ParameterPOSITIONAL_OR_KEYWORDappendVAR_POSITIONALPOSITIONAL_ONLYany
ValueErrorlensum	functoolspartial)rF   argskwargs
var_kwargsnonvar_kwargskvpositional_argsindex_of_star_argsip	star_argshave_seen_var_argrT   rO   s                 @r8   _maybe_partial_apply_variablesru   {   s   <D
 *-#)<<>K41a5G5J!Q$KJKA/A!/D1M  /
,,R0;;BBDE da 	Av!&&F"vv%%;;;QVV$	
:''66	6 	
:''77	7	s?$??%%(_$5 6DtDDE F''
 	
 v;> )o ,da!		'A1C,C 	 
_		"P
 	
 ()j#$		'A1C,C)* *
 

1

	_		"P
 	
 +,mOA&'9,< R.:.	=HH	T6	[ Ls   IIIIzdistribute.TPUStrategy)v1c                   T     e Zd ZdZ	 	 	 d fd	Zd	dZed        Zd Zd Z	d Z
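
# Illustrative sketch (editor's note, not executed): given a user function
# `def fn(x, v): ...` called as `strategy.run(fn, args=(x, dist_var))` where
# `dist_var` is a DistributedVariable, the helper above moves `dist_var` into
# keyword arguments and binds it with `functools.partial`, roughly:
#
#   new_fn, new_args, new_kwargs = _maybe_partial_apply_variables(
#       fn, (x, dist_var), {})
#   # new_fn behaves like functools.partial(fn, v=dist_var); the remaining
#   # non-variable positional argument is forwarded by name, e.g.
#   # new_kwargs == {"x": x}, so the variable is never cast to a tensor.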
 xZS )
TPUStrategyV2a  Synchronous training on TPUs and TPU Pods.

  To construct a TPUStrategy object, you need to run the
  initialization code as below:

  >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  >>> tf.config.experimental_connect_to_cluster(resolver)
  >>> tf.tpu.experimental.initialize_tpu_system(resolver)
  >>> strategy = tf.distribute.TPUStrategy(resolver)

  While using distribution strategies, the variables created within the
  strategy's scope will be replicated across all the replicas and can be kept in
  sync using all-reduce algorithms.

  To run TF2 programs on TPUs, you can either use `.compile` and
  `.fit` APIs in `tf.keras` with TPUStrategy, or write your own customized
  training loop by calling `strategy.run` directly. Note that
  TPUStrategy doesn't support pure eager execution, so please make sure the
  function passed into `strategy.run` is a `tf.function` or
  `strategy.run` is called inside a `tf.function` if eager
  behavior is enabled. See more details in https://www.tensorflow.org/guide/tpu.

  `distribute_datasets_from_function` and
  `experimental_distribute_dataset` APIs can be used to distribute the dataset
  across the TPU workers when writing your own training loop. If you are using
  `fit` and `compile` methods available in `tf.keras.Model`, then Keras will
  handle the distribution for you.

  An example of writing customized training loop on TPUs:

  >>> with strategy.scope():
  ...   model = tf.keras.Sequential([
  ...     tf.keras.layers.Dense(2, input_shape=(5,)),
  ...   ])
  ...   optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)

  >>> def dataset_fn(ctx):
  ...   x = np.random.random((2, 5)).astype(np.float32)
  ...   y = np.random.randint(2, size=(2, 1))
  ...   dataset = tf.data.Dataset.from_tensor_slices((x, y))
  ...   return dataset.repeat().batch(1, drop_remainder=True)
  >>> dist_dataset = strategy.distribute_datasets_from_function(
  ...     dataset_fn)
  >>> iterator = iter(dist_dataset)

  >>> @tf.function()
  ... def train_step(iterator):
  ...
  ...   def step_fn(inputs):
  ...     features, labels = inputs
  ...     with tf.GradientTape() as tape:
  ...       logits = model(features, training=True)
  ...       loss = tf.keras.losses.sparse_categorical_crossentropy(
  ...           labels, logits)
  ...
  ...     grads = tape.gradient(loss, model.trainable_variables)
  ...     optimizer.apply_gradients(zip(grads, model.trainable_variables))
  ...
  ...   strategy.run(step_fn, args=(next(iterator),))

  >>> train_step(iterator)

  For the advanced use cases like model parallelism, you can set
  `experimental_device_assignment` argument when creating TPUStrategy to specify
  number of replicas and number of logical devices. Below is an example to
  initialize TPU system with 2 logical devices and 1 replica.

  >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  >>> tf.config.experimental_connect_to_cluster(resolver)
  >>> topology = tf.tpu.experimental.initialize_tpu_system(resolver)
  >>> device_assignment = tf.tpu.experimental.DeviceAssignment.build(
  ...     topology,
  ...     computation_shape=[1, 1, 1, 2],
  ...     num_replicas=1)
  >>> strategy = tf.distribute.TPUStrategy(
  ...     resolver, experimental_device_assignment=device_assignment)

  Then you can run a `tf.add` operation only on logical device 0.

  >>> @tf.function()
  ... def step_fn(inputs):
  ...   features, _ = inputs
  ...   output = tf.add(features, features)
  ...
  ...   # Add operation will be executed on logical device 0.
  ...   output = strategy.experimental_assign_to_logical_device(output, 0)
  ...   return output
  >>> dist_dataset = strategy.distribute_datasets_from_function(
  ...     dataset_fn)
  >>> iterator = iter(dist_dataset)
  >>> strategy.run(step_fn, args=(next(iterator),))

  `experimental_spmd_xla_partitioning` enables the experimental XLA SPMD feature
  for model parallelism. This flag can reduce the compilation time and HBM
  requirements. When running in this mode, every input tensor must either be
  partitioned (via `strategy.experimental_split_to_logical_devices`) or fully
  replicated (via `strategy.experimental_replicate_to_logical_devices`) to all
  logical devices. And calling `strategy.experimental_assign_to_logical_device`
  will result in a ValueError in this mode.
  """

  def __init__(self,
               tpu_cluster_resolver=None,
               experimental_device_assignment=None,
               experimental_spmd_xla_partitioning=False):
    """Synchronous training in TPU donuts or Pods.

    Args:
      tpu_cluster_resolver: A
        `tf.distribute.cluster_resolver.TPUClusterResolver` instance, which
        provides information about the TPU cluster. If None, it will assume
        running on a local TPU worker.
      experimental_device_assignment: Optional
        `tf.tpu.experimental.DeviceAssignment` to specify the placement of
        replicas on the TPU cluster.
      experimental_spmd_xla_partitioning: If True, enable the SPMD (Single
        Program Multiple Data) mode in XLA compiler. This flag only affects the
        performance of XLA compilation and the HBM requirement of the compiled
        TPU program. Caveat: if this flag is True, calling
        `tf.distribute.TPUStrategy.experimental_assign_to_logical_device` will
        result in a ValueError.
    """
    super().__init__(
        TPUExtended(
            self,
            tpu_cluster_resolver,
            device_assignment=experimental_device_assignment,
            use_spmd_for_xla_partitioning=experimental_spmd_xla_partitioning,
        )
    )
    distribute_lib.distribution_strategy_gauge.get_cell("V2").set(
        "TPUStrategy")
    distribute_lib.distribution_strategy_replica_gauge.get_cell(
        "num_workers").set(self.extended.num_hosts)
    distribute_lib.distribution_strategy_replica_gauge.get_cell(
        "num_replicas_per_worker").set(self.extended.num_replicas_per_host)
    # Packed variables bundle the per-replica components of a mirrored
    # variable into a single handle to reduce per-call overhead in eager mode.
    self._enable_packed_variable_in_eager_mode = True
                               }|xs t        j                         }| j                  j                  ||||      S )a3  Run the computation defined by `fn` on each TPU replica.

    Executes ops specified by `fn` on each replica. If `args` or `kwargs` have
    `tf.distribute.DistributedValues`, such as those produced by a
    `tf.distribute.DistributedDataset` from
    `tf.distribute.Strategy.experimental_distribute_dataset` or
    `tf.distribute.Strategy.distribute_datasets_from_function`,
    when `fn` is executed on a particular replica, it will be executed with the
    component of `tf.distribute.DistributedValues` that correspond to that
    replica.

    `fn` may call `tf.distribute.get_replica_context()` to access members such
    as `all_reduce`.

    All arguments in `args` or `kwargs` should either be nest of tensors or
    `tf.distribute.DistributedValues` containing tensors or composite tensors.

    Example usage:

    >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    >>> tf.config.experimental_connect_to_cluster(resolver)
    >>> tf.tpu.experimental.initialize_tpu_system(resolver)
    >>> strategy = tf.distribute.TPUStrategy(resolver)
    >>> @tf.function
    ... def run():
    ...   def value_fn(value_context):
    ...     return value_context.num_replicas_in_sync
    ...   distributed_values = (
    ...       strategy.experimental_distribute_values_from_function(value_fn))
    ...   def replica_fn(input):
    ...     return input * 2
    ...   return strategy.run(replica_fn, args=(distributed_values,))
    >>> result = run()

    Args:
      fn: The function to run. The output must be a `tf.nest` of `Tensor`s.
      args: (Optional) Positional arguments to `fn`.
      kwargs: (Optional) Keyword arguments to `fn`.
      options: (Optional) An instance of `tf.distribute.RunOptions` specifying
        the options to run `fn`.

    Returns:
      Merged return value of `fn` across replicas. The structure of the return
      value is the same as the return value from `fn`. Each element in the
      structure can either be `tf.distribute.DistributedValues`, `Tensor`
      objects, or `Tensor`s (for example, if running on a single replica).
    
rG   ru   	autograph
tf_convertautograph_ctxcontrol_status_ctxr	   
RunOptionsr   tpu_runrP   rF   ri   rj   optionss        r8   runzTPUStrategyV2.run  sm    ` "5b$GBf 
		b-"B"B"D	EB4224G==  T67;;r7   c                 .    | j                   j                  S )a[  Returns the cluster resolver associated with this strategy.

    `tf.distribute.TPUStrategy` provides the associated
    `tf.distribute.cluster_resolver.ClusterResolver`. If the user provides one
    in `__init__`, that instance is returned; if the user does not, a default
    `tf.distribute.cluster_resolver.TPUClusterResolver` is provided.
    r   _tpu_cluster_resolverrP   s    r8   cluster_resolverzTPUStrategyV2.cluster_resolver  s     ==...r7   c                    | j                   j                  rt        d      | j                   j                  j                  d   }|dk  s||k\  rt        dj                  ||            t        j                  ||d      S )ay  Adds annotation that `tensor` will be assigned to a logical device.

    This adds an annotation to `tensor` specifying that operations on
    `tensor` will be invoked on logical core device id `logical_device_id`.
    When model parallelism is used, the default behavior is that all ops
    are placed on zero-th logical device.

    ```python

    # Initializing TPU system with 2 logical devices and 4 replicas.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    tf.config.experimental_connect_to_cluster(resolver)
    topology = tf.tpu.experimental.initialize_tpu_system(resolver)
    device_assignment = tf.tpu.experimental.DeviceAssignment.build(
        topology,
        computation_shape=[1, 1, 1, 2],
        num_replicas=4)
    strategy = tf.distribute.TPUStrategy(
        resolver, experimental_device_assignment=device_assignment)
    iterator = iter(inputs)

    @tf.function()
    def step_fn(inputs):
      output = tf.add(inputs, inputs)

      # Add operation will be executed on logical device 0.
      output = strategy.experimental_assign_to_logical_device(output, 0)
      return output

    strategy.run(step_fn, args=(next(iterator),))
    ```

    Args:
      tensor: Input tensor to annotate.
      logical_device_id: Id of the logical core to which the tensor will be
        assigned.

    Raises:
      ValueError: The logical device id presented is not consistent with total
      number of partitions specified by the device assignment or the TPUStrategy
      is constructed with `experimental_spmd_xla_partitioning=True`.

    Returns:
      Annotated tensor with identical value as `tensor`.
    zCannot assign a tensor to a logical device in SPMD mode. To disable SPMD, Please construct the TPUStrategy with `experimental_spmd_xla_partitioning=False`   r   z`logical_core_id` to assign must be lower then total number of logical devices per replica. Received logical device id {} but there are only total of {} logical devices in replica.Tuse_sharding_op)r   _use_spmd_for_xla_partitioningrd   _tpu_devicesshapeformatr   assign_device)rP   tensorlogical_device_idnum_logical_devices_per_replicas       r8   %experimental_assign_to_logical_devicez3TPUStrategyV2.experimental_assign_to_logical_device  s    \ }}3378 8
 '+mm&@&@&F&Fq&I#A<< 5 6<V,.M6OP P
 %%!49 9r7   c                 <   | j                   j                  j                  d   }t        j                  |      }|j                  }t        |      }|t        |      k7  r$t        dj                  |t        |                  t        |      D ]1  \  }}|	||   }	||	z  dk7  st        dj                  ||	|             ||k7  rt        dj                  |||            t        j                  |      j                  |      }
t        j                  ||
d      S )a  Adds annotation that `tensor` will be split across logical devices.

    This adds an annotation to tensor `tensor` specifying that operations on
    `tensor` will be split among multiple logical devices. Tensor `tensor` will
    be split across dimensions specified by `partition_dimensions`.
    The dimensions of `tensor` must be divisible by corresponding value in
    `partition_dimensions`.

    For example, for system with 8 logical devices, if `tensor` is an image
    tensor with shape (batch_size, width, height, channel) and
    `partition_dimensions` is [1, 2, 4, 1], then `tensor` will be split
    2 in width dimension and 4 way in height dimension and the split
    tensor values will be fed into 8 logical devices.

    ```python
    # Initializing TPU system with 8 logical devices and 1 replica.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    tf.config.experimental_connect_to_cluster(resolver)
    topology = tf.tpu.experimental.initialize_tpu_system(resolver)
    device_assignment = tf.tpu.experimental.DeviceAssignment.build(
        topology,
        computation_shape=[1, 2, 2, 2],
        num_replicas=1)
    # Construct the TPUStrategy. Since we are going to split the image across
    # logical devices, here we set `experimental_spmd_xla_partitioning=True`
    # so that the partitioning can be compiled in SPMD mode, which usually
    # results in faster compilation and smaller HBM requirement if the size of
    # input and activation tensors are much bigger than that of the model
    # parameters. Note that this flag is suggested but not a hard requirement
    # for `experimental_split_to_logical_devices`.
    strategy = tf.distribute.TPUStrategy(
        resolver, experimental_device_assignment=device_assignment,
        experimental_spmd_xla_partitioning=True)

    iterator = iter(inputs)

    @tf.function()
    def step_fn(inputs):
      inputs = strategy.experimental_split_to_logical_devices(
        inputs, [1, 2, 4, 1])

      # model() function will be executed on 8 logical devices with `inputs`
      # split 2 * 4  ways.
      output = model(inputs)
      return output

    strategy.run(step_fn, args=(next(iterator),))
    ```
    Args:
      tensor: Input tensor to annotate.
      partition_dimensions: An unnested list of integers with the size equal to
        rank of `tensor` specifying how `tensor` will be partitioned. The
        product of all elements in `partition_dimensions` must be equal to the
        total number of logical devices per replica.

    Raises:
      ValueError: 1) If the size of partition_dimensions does not equal to rank
        of `tensor` or 2) if product of elements of `partition_dimensions` does
        not match the number of logical devices per replica defined by the
        implementing DistributionStrategy's device specification or
        3) if a known size of `tensor` is not divisible by corresponding
        value in `partition_dimensions`.

    Returns:
      Annotated tensor with identical value as `tensor`.
    r   zvLength of `partition_dimensions` must equal to the rank of `tensor.shape` ({}). Received len(partition_dimensions)={}.r   zTensor shape at `partition_dimensions[{}]` must be divisible by corresponding value specified by `partition_dimensions` ({}). Received: {}.zThe product of `partition_dimensions` should be the same as the number of logical devices (={}). Received `partition_dimensions`={},and their product is {}.Tr   )r   r   r   npprodre   rd   r   rY   arangereshaper   tile)rP   r   partition_dimensionsr   num_partition_splitsinput_shapetensor_rank	dim_indexdim_size
split_sizetile_assignments              r8   %experimental_split_to_logical_devicesz3TPUStrategyV2.experimental_split_to_logical_devices  sB   F '+mm&@&@&F&Fq&I#77#78,,Kk"Kc.// 77=v&,@(A8CD D
  )5 	?	8		'	2j	J	!	# IIO&
HJ>? 	?	? >>%%+V,K,@,@&BC C ii 45==OV_dKKr7   c                 0    t        j                  |d      S )a  Adds annotation that `tensor` will be replicated to all logical devices.

    This adds an annotation to tensor `tensor` specifying that operations on
    `tensor` will be invoked on all logical devices.

    ```python
    # Initializing TPU system with 2 logical devices and 4 replicas.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    tf.config.experimental_connect_to_cluster(resolver)
    topology = tf.tpu.experimental.initialize_tpu_system(resolver)
    device_assignment = tf.tpu.experimental.DeviceAssignment.build(
        topology,
        computation_shape=[1, 1, 1, 2],
        num_replicas=4)
    strategy = tf.distribute.TPUStrategy(
        resolver, experimental_device_assignment=device_assignment)

    iterator = iter(inputs)

    @tf.function()
    def step_fn(inputs):
      images, labels = inputs
      images = strategy.experimental_split_to_logical_devices(
        inputs, [1, 2, 4, 1])

      # model() function will be executed on 8 logical devices with `inputs`
      # split 2 * 4  ways.
      output = model(inputs)

      # For loss calculation, all logical devices share the same logits
      # and labels.
      labels = strategy.experimental_replicate_to_logical_devices(labels)
      output = strategy.experimental_replicate_to_logical_devices(output)
      loss = loss_fn(labels, output)

      return loss

    strategy.run(step_fn, args=(next(iterator),))
    ```
    Args:
      tensor: Input tensor to annotate.

    Returns:
      Annotated tensor with identical value as `tensor`.
    Tr   )r   	replicate)rP   r   s     r8   )experimental_replicate_to_logical_devicesz7TPUStrategyV2.experimental_replicate_to_logical_devicesi  s    \ !!&$??r7   )NNFr6   NN)__name__
__module____qualname____doc__r   r   propertyr   r   r   r   __classcell__r   s   @r8   rx   rx      sI    cL %).227%6N8<t / /=9~cLJ.@r7   rx   z#distribute.experimental.TPUStrategyc                   @     e Zd ZdZ	 	 d fd	ZddZed        Z xZS )r|   a  Synchronous training on TPUs and TPU Pods.

  To construct a TPUStrategy object, you need to run the
  initialization code as below:

  >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  >>> tf.config.experimental_connect_to_cluster(resolver)
  >>> tf.tpu.experimental.initialize_tpu_system(resolver)
  >>> strategy = tf.distribute.experimental.TPUStrategy(resolver)

  While using distribution strategies, the variables created within the
  strategy's scope will be replicated across all the replicas and can be kept in
  sync using all-reduce algorithms.

  To run TF2 programs on TPUs, you can either use `.compile` and
  `.fit` APIs in `tf.keras` with TPUStrategy, or write your own customized
  training loop by calling `strategy.run` directly. Note that
  TPUStrategy doesn't support pure eager execution, so please make sure the
  function passed into `strategy.run` is a `tf.function` or
  `strategy.run` is called inside a `tf.function` if eager
  behavior is enabled.
  c                    t        j                  d       t        |   t	        | ||             t
        j                  j                  d      j                  d       t
        j                  j                  d      j                  | j                  j                         t
        j                  j                  d      j                  | j                  j                         d| _        y)	aP  Synchronous training in TPU donuts or Pods.

    Args:
      tpu_cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
      device_assignment: Optional `tf.tpu.experimental.DeviceAssignment` to
        specify the placement of replicas on the TPU cluster.
    z`tf.distribute.experimental.TPUStrategy` is deprecated, please use the non-experimental symbol `tf.distribute.TPUStrategy` instead.r'   r{   r|   r}   r~   TN)r   warningr   r   r   r	   r   r   r   r   r   r   r   r   )rP   r   r(   r   s      r8   r   zTPUStrategy.__init__  s     OO	KL 
G /	
 ..77=AA-P66??s4==22366??!##&3t}}'J'J#K 26D.r7   c                     t        |       t        |||      \  }}}t        j                  |t	        j
                               }|xs t        j                         }| j                  j                  ||||      S )zSee base class.r   r   s        r8   r   zTPUStrategy.run  sj    "5b$GBf 
		b-"B"B"D	EB4224G==  T67;;r7   c                 .    | j                   j                  S )al  Returns the cluster resolver associated with this strategy.

    `tf.distribute.experimental.TPUStrategy` provides the
    associated `tf.distribute.cluster_resolver.ClusterResolver`. If the user
    provides one in `__init__`, that instance is returned; if the user does
    not, a default
    `tf.distribute.cluster_resolver.TPUClusterResolver` is provided.
    r   r   s    r8   r   zTPUStrategy.cluster_resolver  s     ==...r7   )NNr   )	r   r   r   r   r   r   r   r   r   r   s   @r8   r|   r|     s/    0 %)!%6F
< 	/ 	/r7   r|   c                   B     e Zd ZdZ	 	 	 d fd	Zed        ZddZ xZS )TPUStrategyV1z)TPU distribution strategy implementation.c                    t         |   t        | |||             t        j                  j                  d      j                  d       t        j                  j                  d      j                  | j                  j                         t        j                  j                  d      j                  | j                  j                         d| _        y)a  Initializes the TPUStrategy object.

    Args:
      tpu_cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
          which provides information about the TPU cluster.
      steps_per_run: Number of steps to run on device before returning to the
          host. Note that this can have side-effects on performance, hooks,
          metrics, summaries etc.
          This parameter is only used when Distribution Strategy is used with
          Keras.
      device_assignment: Optional `tf.tpu.experimental.DeviceAssignment` to
          specify the placement of replicas on the TPU cluster. Currently only
          supports the use case of using a single core within a TPU cluster.
    V1r|   r}   r~   TNr   )rP   r   steps_per_runr(   r   s       r8   r   zTPUStrategyV1.__init__  s    $ 
G["M3DF G..77=AA-P66??s4==22366??!##&3t}}'J'J#K 26D.r7   c                 .    | j                   j                  S )z0DEPRECATED: use .extended.steps_per_run instead.)	_extendedr   r   s    r8   r   zTPUStrategyV1.steps_per_run  s     >>'''r7   c                     t        |       t        |||      \  }}}t        j                  |t	        j
                               }|xs t        j                         }| j                  j                  ||||      S )a  Run `fn` on each replica, with the given arguments.

    Executes ops specified by `fn` on each replica. If `args` or `kwargs` have
    "per-replica" values, such as those produced by a "distributed `Dataset`",
    when `fn` is executed on a particular replica, it will be executed with the
    component of those "per-replica" values that correspond to that replica.

    `fn` may call `tf.distribute.get_replica_context()` to access members such
    as `all_reduce`.

    All arguments in `args` or `kwargs` should either be nest of tensors or
    per-replica objects containing tensors or composite tensors.

    Users can pass strategy specific options to `options` argument. An example
    to enable bucketizing dynamic shapes in `TPUStrategy.run`
    is:

    >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    >>> tf.config.experimental_connect_to_cluster(resolver)
    >>> tf.tpu.experimental.initialize_tpu_system(resolver)
    >>> strategy = tf.distribute.experimental.TPUStrategy(resolver)

    >>> options = tf.distribute.RunOptions(
    ...     experimental_bucketizing_dynamic_shape=True)

    >>> dataset = tf.data.Dataset.range(
    ...    strategy.num_replicas_in_sync, output_type=dtypes.float32).batch(
    ...        strategy.num_replicas_in_sync, drop_remainder=True)
    >>> input_iterator = iter(strategy.experimental_distribute_dataset(dataset))

    >>> @tf.function()
    ... def step_fn(inputs):
    ...  output = tf.reduce_sum(inputs)
    ...  return output

    >>> strategy.run(step_fn, args=(next(input_iterator),), options=options)

    Args:
      fn: The function to run. The output must be a `tf.nest` of `Tensor`s.
      args: (Optional) Positional arguments to `fn`.
      kwargs: (Optional) Keyword arguments to `fn`.
      options: (Optional) An instance of `tf.distribute.RunOptions` specifying
        the options to run `fn`.

    Returns:
      Merged return value of `fn` across replicas. The structure of the return
      value is the same as the return value from `fn`. Each element in the
      structure can either be "per-replica" `Tensor` objects or `Tensor`s
      (for example, if running on a single replica).
    r   r   s        r8   r   zTPUStrategyV1.run  sk    f "5b$GBf			b-"B"B"D	EB4224G==  T67;;r7   )NNNr   )	r   r   r   r   r   r   r   r   r   r   s   @r8   r   r     s/    1 %)!!%6< ( (9<r7   r   c                        e Zd ZdZ	 	 	 	 d. fd	Zd Zd Zd Zej                  j                  fdZd Zd Zd	 Zd
 Zd Zd Z	 d/dZd Zej*                  d        Zd Zd Zed        Zd Zd Zd Zd Zd Zd Z d Z!d Z"ed        Z#ed        Z$ed        Z%ed        Z&ed        Z'ed         Z(ed!        Z)ed"        Z*ed#        Z+ed$        Z,d% Z-d& Z.	 	 	 	 d0d'Z/d( Z0ed)        Z1d/d*Z2d+ Z3d, Z4d- Z5 xZ6S )1r   zImplementation of TPUStrategy.c                    t         |   |       |t        j                  d      }|d}t	        j
                         | _        || _        | j                  j                         | _	        || _
        | j                  j                  D cg c]  }d|j                  v s|j                   }}|0t        j                  |D cg c]  }|g c}t              | _        nt"        j$                  j'                  |d         j(                  }g }	t+        |j,                        D ]e  }
g }t+        |j.                        D ]8  }|j1                  t3        j4                  |j7                  |
||                   : |	j1                  |       g t        j                  |	t              | _        t3        j8                  | j                   d   d         | _        t=        j>                         | _         t=        j>                         | _!        | j                   d d df   D ]  }t3        j8                  |      }| j@                  jE                  |g        | j@                  |   j1                  |       | jB                  jE                  |g        | jB                  |   j1                  |        || _#        d| _$        d| _%        dg| _&        tO        jP                         r#tS        jT                  tN        jV                         | | _,        || _-        d| _.        | j                   d d | jL                  d	   f   }|D ]'  }tO        j^                  |      sd| _.         d| _0        y  d| _0        y c c}w c c}w )
N r   zdevice:TPU:dtyper   )replicalogical_corejobTF)1r   r   tpu_cluster_resolver_libTPUClusterResolverweakrefWeakKeyDictionary_tpu_function_cacher   get_tpu_system_metadata_tpu_metadata_device_assignmentdevicesr\   r   arrayobjectr   r   DeviceSpecV2from_stringr   rangenum_replicasnum_cores_per_replicar`   r   canonicalize
tpu_deviceget_host_for_device_host_devicecollectionsOrderedDict_device_input_worker_devices_host_input_worker_devices
setdefaultr   _require_static_shapes(experimental_enable_get_next_as_optional_logical_device_stackr   r4   atexitregister
async_wait_use_var_policyr   _using_custom_deviceis_custom_device_enable_data_reorder)rP   container_strategyr   r   r(   rz   dtpu_devices_flatjob_nametpu_devices
replica_idreplica_devicesr   r   host_devicer   r   s                   r8   r   zTPUExtended.__init__Z  s9    
G'(#5HHL m
  '88:D!5D33KKMD/D **22Omqvv6MO O
  (((
)1A3
)9d ))556Fq6IJNNhk/<<= ,*!"3"I"IJ 	&L

 
 &&#..(#/" / $%&	& 	?+, ((;f=d#778I8I!8LQ8OPD )4(?(?(AD%&1&=&=&?D#''1- G
33J?k
''22;C
''4;;JG
%%00bA
%%k299+FG 'D"&D48D1"#D  " oog(()  =<D +HD' %D4#=#=b#A ABG 		!	!!	$$(! !&D !&DSO *s   
M&M&
M+c                    | j                   sy| j                  dddf   }g }t        |      D ]n  \  }}t        j                  j                  |      }|j                  |j                  |j                  |j                  |j                  |j                  f|f       p t        |      D cg c]  \  }}|	 c}}S c c}}w )a(  Get the replica order based on the tpu device order.

    For example, if the tpu_devices are:
    '/job:worker/replica:0/task:0/device:TPU:0',
    '/job:worker/replica:0/task:0/device:TPU:2',
    '/job:worker/replica:0/task:1/device:TPU:0',
    '/job:worker/replica:0/task:1/device:TPU:2',
    '/job:worker/replica:0/task:1/device:TPU:6',
    '/job:worker/replica:0/task:1/device:TPU:4',
    '/job:worker/replica:0/task:0/device:TPU:6',
    '/job:worker/replica:0/task:0/device:TPU:4',

    the returned replica order will be:
    [0, 1, 7, 6, 2, 3, 5, 4]

    This replica order will be used to reorder the data returned by the
    iterators,
    so that they can be placed on the same node as their computation graphs.

    Returns:
      A list containing the order ids of corresponding TPU devices.
    Nr   )r   r   rY   	tf_device
DeviceSpecr   r`   r   r   device_typetaskdevice_indexsorted)rP   r  devices_with_idsrq   r   spec_s          r8   _get_replica_orderzTPUExtended._get_replica_order  s    . $$##AqD)K";/ 	:!!--j9dhhllii 	 			 !!123$!QA333s   -B=c                 0    t        j                  ||        y r;   )r
   validate_colocate)rP   colocate_with_variables     r8    _validate_colocate_with_variablez,TPUExtended._validate_colocate_with_variable  s    &&'=tDr7   c                     t        j                  t        | j                  j	                                     }t        j                  ||| j                         | j                        S )z)Make iterators for each of the TPU hosts.)num_replicas_in_sync)	r   InputWorkerstupler   rX   input_lib_v1DatasetIterator_container_strategy_num_replicas_in_sync)rP   datasetinput_workerss      r8   _make_dataset_iteratorz"TPUExtended._make_dataset_iterator  sW    **d//5578:M''  "!77	9 9r7   c                 X   g }t        j                  t        | j                  j	                                     }|j
                  }t        |      D ]3  }|j                  t        j                  ||| j                               5 t        j                  |||| j                               S )Nnum_input_pipelinesinput_pipeline_idr  )r   r  r  r   rX   r}   r   r`   r	   InputContextr  r  InputFunctionIteratorr  )rP   input_fnreplication_modeinput_contextsr  r}   rq   s          r8   _make_input_fn_iteratorz#TPUExtended._make_input_fn_iterator  s     N**d//5578:M++K; @

%
%"- !#'#=#=?@@ --h.<.2.F.F.HJ Jr7   c                 j    t        j                  |t        j                  | j                        |      S r;   )r   one_host_numpy_datasetSingleDevicer   )rP   numpy_inputsessions      r8    _experimental_make_numpy_datasetz,TPUExtended._experimental_make_numpy_dataset
  s.    //]//0A0AB r7   c                     |r|j                   r6t        j                  t        | j                  j                                     S t        j                  t        | j                  j                                     S r;   )experimental_fetch_to_devicer   r  r  r   rX   r   )rP   r   s     r8   _get_input_workerszTPUExtended._get_input_workers  sc    g::##
11779
:< < ##
//557
8: :r7   c           	      *   t        |t        j                        r|j                  }t	        j
                  |      }|D ]S  \  }}t        |t        j                  t        j                  f      s1t        dj                  |t        |                   y )Na>  Found tensor {} with spec {}. TPUStrategy does not support distributed datasets with device prefetch when using sparse or ragged tensors. If you intend to use sparse or ragged tensors, please pass a tf.distribute.InputOptions object with experimental_fetch_to_device set to False to your dataset distribution function.)r@   r   PerReplicaSpec_component_specsr.    flatten_with_joined_string_pathsr   SparseTensorSpecr%   RaggedTensorSpecrd   r   type)rP   element_specspecspathr  s        r8   _check_speczTPUExtended._check_spec  s    , 5 56!22l11,?E 	?
d	D=99(99; 
<%
 &,VD$t*%=? 	?	?r7   c           	      \   |r2|j                   t        j                  j                  k(  rt	        d      ||j
                  r| j                  |j                         t        j                  || j                  |      | j                         | j                  || j                               S )NzgInputReplicationMode.PER_REPLICA is only supported in `experimental_distribute_datasets_from_function`.)r  r   replica_order)experimental_replication_moder	   InputReplicationModePER_REPLICArE   r1  r=  r:  r   get_distributed_datasetr2  r  r  r  )rP   r  r   s      r8    _experimental_distribute_datasetz,TPUExtended._experimental_distribute_dataset&  s    G99++778> 
 '>>
w++,--(  "!77--/ r7   c           	         |r2|j                   t        j                  j                  k(  rt	        d      | j                  |      }g }|j                  }t        |      D ]3  }|j                  t        j                  ||| j                               5 t        j                  |||| j                         || j                               }||j                  r| j!                  |j"                         |S )NzInputReplicationMode.PER_REPLICA is only supported in  `experimental_distribute_datasets_from_function` of tf.distribute.MirroredStrategyr!  )r   r?  )r@  r	   rA  rB  rE   r2  r}   r   r`   r$  r  r   &get_distributed_datasets_from_functionr  r  r1  r=  r:  )rP   
dataset_fnr   r  r(  r}   rq   distributed_datasets           r8   "_distribute_datasets_from_functionz.TPUExtended._distribute_datasets_from_function:  s    G99++778./ /
 ++G4MN++K; <N77)#99; << %KK  "--/ '>>
*778r7   c           	          g }t        | j                        D ]7  }|j                   |t        j                  || j                                     9 t        j                  |d      S )NT)always_wrap)r   r  r`   r	   ValueContextr
   regroup)rP   value_fnper_replica_valuesr  s       r8   -_experimental_distribute_values_from_functionz9TPUExtended._experimental_distribute_values_from_functionY  sk    D667 M

>..z/3/I/IK LMM ##$6DIIr7   c                 "    |i }t        j                  |      }t        j                         fdt	        j
                         j                          _         fd}t        |t              sJ | j                  z  }t	        j                   j                        5   j                  dk(  r |       }nt        j                  |||      }d d d         `t!        j"                        _        t        |t              ra|D cg c]  }t        |t        j&                        r|! }}t)        |       j                  z  }	t+        |	      D 
cg c]
  }
||
d |	    }}
ng }t-        |       S # 1 sw Y   xY wc c}w c c}
w )Nc                     |       }t        j                  j                        }|rCt        j                  |g      5  |D cg c]  }t        j                  |       c}cddd       S |S c c}w # 1 sw Y   yxY w)zSingle step on the TPU device.N)r.   rK   last_step_outputsr   control_dependenciesr    identity)inputs	fn_resultflat_last_step_outputsfctxrF   s       r8   run_fnz?TPUExtended._experimental_run_steps_on_iterator.<locals>.run_fnl  s|    S&/i#||C,A,AB	%%yk2 	I1GHA)$$Q'H	I 	I  I	I 	Is   A6A1#A61A66A?c                     ~ j                         }g }t        j                        D ]-  fd}|j                  t	        j
                  ||      f       / t        j                  |j                  t        j                  j                              }t        |d   t              rt	        j                  |      }|S )z%The rewritten step fn running on TPU.c                 0    t        j                  |       S r;   )r
   select_replica)rM   r  s    r8   <lambda>zUTPUExtended._experimental_run_steps_on_iterator.<locals>.rewrite_fn.<locals>.<lambda>  s    #3#B#B$ r7   rz   )r(   xla_optionsr   )get_nextr   r  r`   r.   map_structurer)   r   r   
XLAOptionsr   r@   listrK   )	ri   per_replica_inputsreplicate_inputsr^  replicate_outputsr  multi_worker_iteratorr[  rP   s	        @r8   
rewrite_fnzCTPUExtended._experimental_run_steps_on_iterator.<locals>.rewrite_fn  s    
099;d889 3*!3!3."0 !2 	33 --

 33nn4&D&DF	G 
%a($	/ LL):;r7   r   )r.   rK   r   MultiStepContextr   get_default_graph_get_control_flow_context_outer_control_flow_contextr@   re  r  r   r   r   r+   repeatr!   grouprun_op	Operationre   r   _set_last_step_outputs)rP   rF   ri  
iterationsinitial_loop_valuesrj  rh  rM   last_step_tensor_outputs
output_numrq   rZ  r[  s   ```        @@r8   #_experimental_run_steps_on_iteratorz/TPUExtended._experimental_run_steps_on_iteratord  s    ",,':;

$
$
&C" 	99; 	$: )4000-0J0JJ 
D%%	& F			q	 &L)00Z1DF	F 	(!''(9:CJ#T* '"jCMM.J!" " /0D4N4NNj;@;L"67
"1=j=
1" "
 "$3 89JEF F""s   '/E;F.FF;Fc                 p    t        | j                               5   ||i |cd d d        S # 1 sw Y   y xY wr;   )_TPUReplicaContextr  )rP   rF   ri   rj   s       r8   _call_for_each_replicaz"TPUExtended._call_for_each_replica  s7     
D446	7 !  ! ! !s   ,5c              #     K   | j                   j                  d   }||k\  rt        dj                  ||            | j                  j                  |       	 t        j                         d n5t        j                  t        j                  |            5  d ddd       | j                  j                          y# 1 sw Y   $xY w# | j                  j                          w xY ww)9Places variables and ops on the specified logical device.r   z]`logical_device_id` not in range (was {}, but there are only {} logical devices per replica).N)r   r   rd   r   r   r`   r   enclosing_tpu_contextr   r   r)   corepop)rP   r   r   s      r8   experimental_logical_devicez'TPUExtended.experimental_logical_device  s      '+&7&7&=&=a&@#;;**0&!@+BC C
 	%%&78'		'	'	)	1ZZ!234 	
	   $$&	 	   $$&s7   AC,AC CC &C,C
C C))C,c                 B    t        j                  | j                         y)zExperimental method added to be used by Estimator.

    This is a private method only to be used by Estimator. Other frameworks
    should directly be calling `tf.tpu.experimental.initialize_tpu_system`
    N)r   initialize_tpu_systemr   r   s    r8   _experimental_initialize_systemz+TPUExtended._experimental_initialize_system  s     2243M3MNr7   c                    	
 |j                  dd      r di |S |j                  dd      }|	 |fi |S |j                  dd      }|! j                  dd j                  d   f   nWt        |t        j
                        r1t        j                  |j                        5   di |cddd       S |j                   j                  j                  \  fd
 fd	 fd	
fd
}	 fd} j                  s&t               r j                  rdkD  r|}n|}d|d<   n j                  rdkD  r	}n
}t        j                   j                         |t        j                   t        j"                  fi |} j                  s!t               rt%        |d j&                         |S # 1 sw Y   xY w)z?Create a TPUMirroredVariable. See `DistributionStrategy.scope`.skip_mirrored_creatorFcustom_tpu_variable_creatorNcolocate_withr   c                  J   d}g }t              D ]  \  }}t        j                  |      5  |dk(  r,| d   }t               5  t	        |      r |       n|}ddd       |dkD  r+|d   j
                  j                  d      d   }d||fz  | d<   || d<   t        j                  t        j                        5   di | }ddd       t        t        j                        rJ |j                  |       ddd        |S # 1 sw Y   xY w# 1 sw Y   NxY w# 1 sw Y   xY w)Returns a list of `tf.Variable`s.

      The list contains `number_replicas` `tf.Variable`s and can be used to
      initialize a `TPUMirroredVariable`.

      Args:
        **kwargs: the keyword arguments for creating a variable
      Nr   initial_value:%s/replica_%d/r\   r6   )rY   r   r   r>   rC   r\   splitr   device_policyDEVICE_PLACEMENT_SILENTr@   r   TPUMirroredVariabler`   )	rj   r  
value_listrq   r   var0namern   r   next_creators	          r8   _create_mirrored_tpu_variableszDTPUExtended._create_variable.<locals>._create_mirrored_tpu_variables  s;    mjG$ $!QZZ] 	!V"?3M "# 4192!mo&3 4 U!!}))//4Q7H .1=F6N$1&
!$$W%D%DE '&v&A'  :#A#AB
BB


A
-	 	0 %4 4' '%	 	s<   DD A D6	D?5D D	DDDD"	c                     | d   }t               5  t        |      r |       n|}ddd       g }t              D ]  }g }t        
      D ]N  }t        j                  j
                  |   |         5  || d<    	di | }ddd       |j                         P dj                  | d   |      }t        j                  ||      }|j                  |        |S # 1 sw Y   xY w# 1 sw Y   ixY w)  Returns a list of `TPUReplicatedVariable`s.

      The list consists of `num_replicas` `TPUReplicatedVariable`s and can be
      used to initialize a `TPUMirroredVariable`. Each `TPUReplicatedVariable`
      contains a list of `tf.Variable`s which are replicated to
      `num_cores_per_replica` logical cores to enable XLA SPMD compilation.

      Args:
        **kwargs: the keyword arguments for creating a variable
      r  N{}/r:{}r\   r$   r\   r6   )
r>   rC   r   r   r   r   r`   r   r   TPUReplicatedVariable)rj   r  mirrored_replicated_var_listr  replicated_var_listlogic_core_idrn   replica_nametpu_replicated_varr  r   r   rP   s            r8   )_create_mirrored_tpu_replicated_variableszOTPUExtended._create_variable.<locals>._create_mirrored_tpu_replicated_variables#  s    _-m  .+3, - 	. &("l+ @* "#89 	(Mzz$++J7FG '&3F?#&v&A' 
$
$Q
'		(
 !''v
C4JJ)> 	%++,>?@ *)%. .' 's   C3C)C&)C2c                      t        j                  di | }j                  j                  |       t	        |dj                         |S )N_lazy_scoper6   )r   TPUUninitializedVariablelazy_variable_trackeradd_uninitialized_varsetattr)rj   uninitialized_variablerP   s     r8   uninitialized_variable_creatorzDTPUExtended._create_variable.<locals>.uninitialized_variable_creatorI  sI    '@@J6J
  66
  $mT5O5OP##r7   c            
      z   | j                  dd       d
i | S g }d}t              D ]g  \  }}t        j                  |      5  |dk(  r`| j                  dd      }t	               5  |9t        |      r |       }t        j                  || j                  dd            }ddd       |dkD  r+|d   j                  j                  d      d   }d||fz  | d<   || d<   | j                  dd      | d   j                  | d<   | j                  d	d      | d   j                  | d	<   t        j                  t        j                        5   	d
i | }ddd       t        t        j                         rJ |j#                  |       ddd       j |S # 1 sw Y   xY w# 1 sw Y   OxY w# 1 sw Y   xY w)r  r  Nr   r   r   r  r  r\   r   r6   )getrY   r   r   r>   rC   convert_to_tensorr\   r  r   r   r   r  r  r@   r   r  r`   )
rj   r  r  rq   r   r  rn   r  r   r  s
          r8   ,_create_uninitialized_mirrored_tpu_variableszRTPUExtended._create_variable.<locals>._create_uninitialized_mirrored_tpu_variablesR  s    
OT	*	2-777jmG$ "$!QZZ] !	!V"JJ=M!# *M*"//- # 5 5!GT)B! U!!}))//4Q7H .1=F6N$1&
!ZZ&.$_5;;F7OZZ&.$_5;;F7O$$W%D%DE 9.88A9  :#A#AB
BB


A
C!	 !	"F = 29 9;!	 !	s=   "F0)<F%B(F0	F$5F0F!F0$F-)F00F:	c                     | j                  dd      }| j                  dd      }| j                  dd      }| d	i | S t               5  |Vt        |      r |       }t        j                  ||      }|| d<   || d   j
                  | d<   || d   j                  | d<   ddd       g }t              D ]  }g }t              D ]I  }t        j                  j                  |   |         5   d	i | }ddd       |j                         K dj                  | d   |      }	t        j                  ||	      }
|j                  |
        |S # 1 sw Y   xY w# 1 sw Y   ixY w)
r  r   Nr   r  r   r  r\   r  r6   )r  r>   rC   r   r  r   r   r   r   r   r`   r   r   r  )rj   r   r   r  r  r  r  r  rn   r  r  r  r   r   rP   r  s              r8   7_create_uninitialized_mirrored_tpu_replicated_variablesz]TPUExtended._create_variable.<locals>._create_uninitialized_mirrored_tpu_replicated_variables  s    jj$'ejj$'ejj$7m		8B6BB <$m$)OM//5- %2&
!]$_5;;F7O]$_5;;F7O<  &("l+ @* "#89 	(Mzz$++J7FG 9.88A9

$
$Q
'	( !''v
C4JJ)
 	%++,>?@ *)=< <*9 9s   AE2	E#E #E,r   T!experimental_batch_initializationr  r6   )r  r   r   r@   r   r,  r   r   _devicesr   r   r9   r   r
   create_mirrored_variabler  TPU_VARIABLE_CLASS_MAPPINGTPU_VARIABLE_POLICY_MAPPINGr  r  )rP   r  rj   r  r  r  r  real_creatormirrored_variabler  r  r   r   r   r  s   ``       @@@@@@r8   _create_variablezTPUExtended._create_variable  s    zz)51#F##"(**%t# #.(@@@JJ5M!!!T%?%?%C"CDg	M=#=#=	>::m**+ &%f%& & &&g*.*;*;*A*A'L'#J *L$2h0* 0*d $$)M)O		,	,1F1JNC48f01 
	,	,1F1J@5(AA  "3344	
  $$)M)O0J0JKu& &s   .GGc                 f    t        | dd       st        j                         | _        | j                  S )N_lazy_variable_tracker)getattrr   LazyVariableTrackerr  r   s    r8   r  z!TPUExtended.lazy_variable_tracker  s,    4148$,$@$@$Bd!&&&r7   c                 :      fd}t        j                  d|      S )Nc                    t        j                         }j                  j                         D ]+  }t	        j
                  |      5   | |i |||<   d d d        - t        j                  j                         |      S # 1 sw Y   [xY wr;   )	r   r   r   keysr   r   r   PerWorkerResourcer  )r  ri   rj   host_to_tabler  rP   s        r8   lookup_creatorz;TPUExtended._resource_creator_scope.<locals>.lookup_creator  s    !--/m::??A E+ZZ$ 	E'3T'DV'D-
  def _gather_to_implementation(self, value, destinations, axis, options):
    if not isinstance(value, values.DistributedValues):
      return value

    value_list = list(value.values)
    # pylint: disable=protected-access
    if isinstance(
        value,
        values.DistributedVariable) and value._packed_variable is not None:
      value_list = list(
          value._packed_variable.on_device(d)
          for d in value._packed_variable.devices)
    # pylint: enable=protected-access

    # XLA op-by-op mode has a limit on the number of inputs to a single op, so
    # one big concat is broken into a chain of smaller concats.
    if len(value.values) <= _XLA_OP_BY_OP_INPUTS_LIMIT:
      output = array_ops.concat(value_list, axis=axis)
    else:
      output = array_ops.concat(
          value_list[:_XLA_OP_BY_OP_INPUTS_LIMIT], axis=axis)
      for i in range(_XLA_OP_BY_OP_INPUTS_LIMIT, len(value_list),
                     _XLA_OP_BY_OP_INPUTS_LIMIT - 1):
        output = array_ops.concat(
            [output] + value_list[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT - 1],
            axis=axis)

    output = self._broadcast_output(destinations, output)
    return output

  def _broadcast_output(self, destinations, output):
    devices = cross_device_ops_lib.get_devices_from(destinations)

    if len(devices) == 1:
      # If necessary, copy to the requested destination.
      dest_canonical = device_util.canonicalize(devices[0])
      host_canonical = device_util.canonicalize(self._host_device)

      if dest_canonical != host_canonical:
        with ops.device(dest_canonical):
          output = array_ops.identity(output)
    else:
      output = cross_device_ops_lib.simple_broadcast(output, destinations)

    return output

  def _reduce_to(self, reduce_op, value, destinations, options):
    if (isinstance(value, values.DistributedValues) or
        tensor_util.is_tf_type(value)
       ) and tpu_util.enclosing_tpu_context() is not None:
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # scalar_mul maintains the type of value: tensor or IndexedSlices.
        value = math_ops.scalar_mul((1. / self._num_replicas_in_sync), value)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            f"`reduce_op`={reduce_op} is not supported. Currently we only "
            "support ReduceOp.SUM and ReduceOp.MEAN in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    if not isinstance(value, values.DistributedValues):
      # Reduces values that are not PerReplica or Mirrored values, e.g. the
      # same value present on all replicas.
      return cross_device_ops_lib.reduce_non_distributed_value(
          reduce_op, value, destinations, self._num_replicas_in_sync)

    value_list = value.values
    # pylint: disable=protected-access
    if isinstance(
        value,
        values.DistributedVariable) and value._packed_variable is not None:
      value_list = tuple(
          value._packed_variable.on_device(d)
          for d in value._packed_variable.devices)
    # pylint: enable=protected-access

    # XLA op-by-op mode has a limit on the number of inputs to a single op, so
    # one big `add_n` is broken into a group of smaller ones.
    if len(value.values) <= _XLA_OP_BY_OP_INPUTS_LIMIT:
      output = math_ops.add_n(value_list)
    else:
      output = array_ops.zeros_like(value_list[0], dtype=value_list[0].dtype)
      for i in range(0, len(value_list), _XLA_OP_BY_OP_INPUTS_LIMIT):
        output += math_ops.add_n(value_list[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT])

    if reduce_op == reduce_util.ReduceOp.MEAN:
      output *= (1. / len(value_list))

    output = self._broadcast_output(destinations, output)
    return output
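  # NOTE: Illustrative sketch (not part of the original source). Inside a
  # replicated computation, `_reduce_to` lowers to a single
  # `cross_replica_sum`; outside of it, per-replica components are combined on
  # the host in chunks of `_XLA_OP_BY_OP_INPUTS_LIMIT`. Hypothetical usage:
  #
  #   per_replica_loss = strategy.run(step_fn, args=(batch,))
  #   total_loss = strategy.reduce(
  #       tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)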
  def _update(self, var, fn, args, kwargs, group):
    assert isinstance(var, tpu_values.TPUVariableMixin) or isinstance(
        var, resource_variable_ops.BaseResourceVariable)
    if tpu_util.enclosing_tpu_context() is not None:
      if group:
        return fn(var, *args, **kwargs)
      else:
        return (fn(var, *args, **kwargs),)

    # Inside `tf.function`, the packed variable is not expanded in Python; it
    # is expanded later when the function is instantiated in the runtime.
    packed_var = var._packed_variable  # pylint: disable=protected-access
    if packed_var is not None and not context.executing_eagerly():
      if group:
        return fn(packed_var, *args, **kwargs)
      else:
        return (fn(packed_var, *args, **kwargs),)

    # Otherwise, fall back to updating each replica's value directly.
    updates = []
    values_and_devices = []
    if packed_var is not None:
      for device in packed_var.devices:
        values_and_devices.append((packed_var, device))
    else:
      for value in var.values:
        values_and_devices.append((value, value.device))

    if (var.synchronization != variables_lib.VariableSynchronization.ON_READ
        and var.aggregation != variables_lib.VariableAggregation.NONE):
      distribute_utils.assert_mirrored(args)
      distribute_utils.assert_mirrored(kwargs)
    for i, value_and_device in enumerate(values_and_devices):
      value = value_and_device[0]
      device = value_and_device[1]
      name = "update_%d" % i
      with ops.device(device), distribute_lib.UpdateContext(i), \
           ops.name_scope(name):
        # If args and kwargs are not mirrored, the value is returned as is.
        updates.append(
            fn(value, *distribute_utils.select_replica(i, args),
               **distribute_utils.select_replica(i, kwargs)))
    return distribute_utils.update_regroup(self, updates, group)
  def read_var(self, var):
    assert isinstance(var, tpu_values.TPUVariableMixin) or isinstance(
        var, resource_variable_ops.BaseResourceVariable)
    return var.read_value()

  def value_container(self, value):
    return value

  def _broadcast_to(self, tensor, destinations):
    del destinations
    # This is both a fast path for Python constants, and a way to delay
    # converting Python values to a tensor until we know what type it
    # should be converted to. Otherwise we have trouble with:
    #   global_step.assign_add(1)
    # since the `1` gets broadcast as an int32 but global_step is int64.
    if isinstance(tensor, (float, int)):
      return tensor
    if tpu_util.enclosing_tpu_context() is not None:
      broadcast_tensor = [tensor] * self._num_replicas_in_sync
      result = tpu_ops.all_to_all(
          broadcast_tensor,
          concat_dimension=0,
          split_dimension=0,
          split_count=self._num_replicas_in_sync)
      return result[0]
    return tensor

  @property
  def num_hosts(self):
    if self._device_assignment is None:
      return self._tpu_metadata.num_hosts

    return len(set([self._device_assignment.host_device(r)
                    for r in range(self._device_assignment.num_replicas)]))

  @property
  def num_replicas_per_host(self):
    if self._device_assignment is None:
      return self._tpu_metadata.num_of_cores_per_host

    # `num_replicas_per_host` is not a constant when a `device_assignment` is
    # used; this is a conservative per-host bound.
    max_models_per_host = (
        self._tpu_metadata.num_of_cores_per_host //
        self._device_assignment.num_cores_per_replica)
    return min(self._device_assignment.num_replicas, max_models_per_host)

  @property
  def _num_replicas_in_sync(self):
    if self._device_assignment is None:
      return self._tpu_metadata.num_cores
    return self._device_assignment.num_replicas

  @property
  def experimental_between_graph(self):
    return False

  @property
  def experimental_should_init(self):
    return True

  @property
  def should_checkpoint(self):
    return True

  @property
  def should_save_summary(self):
    return True

  @property
  def worker_devices(self):
    return tuple(self._tpu_devices[:, self._logical_device_stack[-1]])

  @property
  def parameter_devices(self):
    return self.worker_devices

  @property
  def tpu_hardware_feature(self):
    """Return the `tf.tpu.experimental.HardwareFeature` class."""
    return tpu_hardware_feature.HardwareFeature(
        self._tpu_cluster_resolver.tpu_hardware_feature)

  def non_slot_devices(self, var_list):
    return self._host_device

  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
    del colocate_with
    with ops.device(self._host_device), distribute_lib.UpdateContext(None):
      result = fn(*args, **kwargs)
      if group:
        return result
      else:
        return nest.map_structure(self._local_results, result)
  def _configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
    del cluster_spec, task_type, task_id
    if session_config:
      session_config.CopyFrom(self._update_config_proto(session_config))

  def _update_config_proto(self, config_proto):
    updated_config = copy.deepcopy(config_proto)
    updated_config.isolate_session_state = True
    cluster_spec = self._tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
      updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    return updated_config

  @property
  def _global_batch_size(self):
    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.

    `make_input_fn_iterator` assumes per-replica batching.

    Returns:
      Boolean.
    """
    return True

  def tpu_run(self, fn, args, kwargs, options=None):
    func = self._tpu_function_creator(fn, options)
    return func(args, kwargs)

  def _tpu_function_creator(self, fn, options):
    if context.executing_eagerly() and fn in self._tpu_function_cache:
      return self._tpu_function_cache[fn]

    strategy = self._container_strategy()

    def tpu_function(args, kwargs):
      """TF Function used to replicate the user computation."""
      logging.vlog(1,
                   "`TPUStrategy.run` is called with [args: %s] [kwargs: %s]",
                   args, kwargs)

      if kwargs is None:
        kwargs = {}

      # Used to re-structure flattened output tensors from `tpu.replicate()`
      # into a structured format.
      result = [[]]

      def replicated_fn(replica_id, replica_args, replica_kwargs):
        """Wraps user function to provide replica ID and `Tensor` inputs."""
        with _TPUReplicaContext(strategy,
                                replica_id_in_sync_group=replica_id):
          result[0] = fn(*replica_args, **replica_kwargs)
        return result[0]

      replicate_inputs = []  # By replica.
      for i in range(strategy.num_replicas_in_sync):
        replicate_inputs.append(
            [constant_op.constant(i, dtype=dtypes.int32),
             distribute_utils.select_replica(i, args),
             distribute_utils.select_replica(i, kwargs)])

      # Construct and pass `maximum_shapes` so that we could support dynamic
      # shapes using the dynamic padder.
      if options.experimental_enable_dynamic_batch_size and replicate_inputs:
        maximum_shapes = []
        flattened_list = nest.flatten(replicate_inputs[0])
        for input_tensor in flattened_list:
          if tensor_util.is_tf_type(input_tensor):
            rank = input_tensor.shape.rank
          else:
            rank = np.ndim(input_tensor)
          if rank is None:
            raise ValueError(
                "input tensor {} to TPUStrategy.run() has unknown rank, "
                "which is not allowed".format(input_tensor))
          maximum_shapes.append(tensor_shape.TensorShape([None] * rank))
        maximum_shapes = nest.pack_sequence_as(replicate_inputs[0],
                                               maximum_shapes)
      else:
        maximum_shapes = None

      if options.experimental_bucketizing_dynamic_shape:
        padding_spec = tpu.PaddingSpec.POWER_OF_TWO
      else:
        padding_spec = None

      with strategy.scope():
        xla_options = options.experimental_xla_options or tpu.XLAOptions(
            use_spmd_for_xla_partitioning=self._use_spmd_for_xla_partitioning)
        replicate_outputs = tpu.replicate(
            replicated_fn,
            replicate_inputs,
            device_assignment=self._device_assignment,
            maximum_shapes=maximum_shapes,
            padding_spec=padding_spec,
            xla_options=xla_options)

      # Remove all no-ops that may have been added during `tpu.replicate()`.
      filter_ops = lambda x: [o for o in x if not isinstance(o, ops.Operation)]

      # Workaround for `tpu.replicate()` behaviour when the user function
      # returns a single `Tensor` or only `Operation`s.
      if result[0] is None or isinstance(result[0], ops.Operation):
        replicate_outputs = [None] * len(replicate_outputs)
      else:
        replicate_outputs = [
            nest.pack_sequence_as(result[0], filter_ops(nest.flatten(output)))
            for output in replicate_outputs
        ]
      return distribute_utils.regroup(replicate_outputs)

    if context.executing_eagerly():
      tpu_function = def_function.function(tpu_function)
      self._tpu_function_cache[fn] = tpu_function
    return tpu_function

  def _in_multi_worker_mode(self):
    """Whether this strategy indicates working in multi-worker settings."""
    # TPUStrategy treats the whole cluster as a single worker from the
    # higher-level (e.g. Keras) library's point of view.
    return False

  def _get_local_replica_id(self, replica_id_in_sync_group):
    return replica_id_in_sync_group
PER_WORKERr)  r/  r2  r=  rD  rI  rP  rx  r{  
contextlibcontextmanagerr  r  r  r   r  r  r  r  r  r  r  r   r  r   r   r  r  r  r  r  r  r!  r*   r&  r)  r1  r,  r;  r   r=  r_  ra  r   r   s   @r8   r   r   W  s   &
def _make_axis_nonnegative(axis, rank):
  # Convert a potentially negative `axis` index into its non-negative
  # equivalent, handling both Python ints and scalar tensors.
  if isinstance(axis, int):
    if axis >= 0:
      return axis
    else:
      return axis + rank
  else:
    return array_ops.where_v2(
        math_ops.greater_equal(axis, 0),
        axis,
        axis + rank)
class _TPUReplicaContext(distribute_lib.ReplicaContext):
  """Replication Context class for TPU Strategy."""

  def __init__(self, strategy, replica_id_in_sync_group=0):
    distribute_lib.ReplicaContext.__init__(
        self, strategy, replica_id_in_sync_group=replica_id_in_sync_group)

  @property
  def devices(self):
    distribute_lib.require_replica_context(self)
    ds = self._strategy
    replica_id = tensor_util.constant_value(self.replica_id_in_sync_group)

    if replica_id is None:  # Non-constant `Tensor` inside `tpu.replicate`.
      return (tpu.core(0),)
    else:
      return (ds.extended.worker_devices[replica_id],)

  def experimental_logical_device(self, logical_device_id):
    """Places variables and ops on the specified logical device."""
    return self.strategy.extended.experimental_logical_device(
        logical_device_id)

  def _compute_all_gather_output_shape(self, value_shape, value_rank, axis):
    if isinstance(value_shape, list):
      output_shape = list(value_shape)
      output_shape[axis] *= self.num_replicas_in_sync
    else:
      output_shape = array_ops.where_v2(
          math_ops.equal(math_ops.range(value_rank), axis),
          value_shape * self.num_replicas_in_sync,
          value_shape)
    return output_shape

  def all_gather(self, value, axis, experimental_hints=None):
    del experimental_hints
    for v in nest.flatten(value):
      if isinstance(v, indexed_slices.IndexedSlices):
        raise NotImplementedError("all_gather does not support IndexedSlices")

    def _all_gather_tensor(value, axis):
      value = ops.convert_to_tensor(value)

      # Compute the shape and rank outside the TPU computation when possible.
      if value.shape.rank is None:
        value_rank = array_ops.rank(value)
        value_shape = array_ops.shape(value)
      else:
        value_rank = value.shape.rank
        value_shape = value.shape.as_list()
        value_shape_tensor = array_ops.shape(value)
        for i in range(len(value_shape)):
          if value_shape[i] is None:
            value_shape[i] = value_shape_tensor[i]

      axis = _make_axis_nonnegative(axis, value_rank)

      # Shape that expands the gathered dimension by `num_replicas_in_sync`.
      if isinstance(value_rank, int):
        replica_broadcast_shape = [1] * (value_rank + 1)
        replica_broadcast_shape[axis] = self.num_replicas_in_sync
      else:
        replica_broadcast_shape = array_ops.where_v2(
            math_ops.equal(math_ops.range(value_rank + 1), axis),
            self.num_replicas_in_sync,
            1)

      output_shape = self._compute_all_gather_output_shape(
          value_shape, value_rank, axis)

      if value.dtype in _DTYPES_SUPPORTED_BY_CROSS_REPLICA_SUM:
        # The one-hot mask zeroes out every other replica's slot along the new
        # replica dimension; the cross-replica sum then fills them all in.
        replica_id_mask = array_ops.one_hot(
            self.replica_id_in_sync_group, self.num_replicas_in_sync)
        replica_id_mask = array_ops.reshape(replica_id_mask,
                                            replica_broadcast_shape)
        replica_id_mask = math_ops.cast(replica_id_mask, value.dtype)

        gathered_value = array_ops.expand_dims(value, axis) * replica_id_mask
        gathered_value = self.all_reduce(reduce_util.ReduceOp.SUM,
                                         gathered_value)
        return array_ops.reshape(gathered_value, output_shape)
      else:
        # For dtypes that cross_replica_sum does not support, use all_to_all
        # and then restore the replica order.
        value = array_ops.expand_dims(value, axis)
        unordered_output = tpu_ops.all_to_all(
            value,
            concat_dimension=axis,
            split_dimension=axis,
            split_count=self.num_replicas_in_sync)

        # Gather the replica ids the same way to learn the permutation.
        concat_replica_id = array_ops.reshape(self.replica_id_in_sync_group,
                                              [1])
        concat_replica_id = array_ops.tile(concat_replica_id,
                                           [self.num_replicas_in_sync])
        xla_to_replica_context_id = tpu_ops.all_to_all(
            concat_replica_id,
            concat_dimension=0,
            split_dimension=0,
            split_count=self.num_replicas_in_sync)
        replica_context_to_xla_id = math_ops.argmax(
            array_ops.one_hot(xla_to_replica_context_id,
                              self.num_replicas_in_sync),
            axis=0)
        sorted_with_extra_dim = array_ops.gather(
            unordered_output, replica_context_to_xla_id, axis=axis)
        return array_ops.reshape(sorted_with_extra_dim, output_shape)

    ys = [_all_gather_tensor(t, axis=axis) for t in nest.flatten(value)]
    return nest.pack_sequence_as(value, ys)


def _set_last_step_outputs(ctx, last_step_tensor_outputs):
  """Sets the last step outputs on the given context."""
  # Convert the flat list of outputs back to the original dict structure of
  # `last_step_outputs`.
  last_step_tensor_outputs_dict = nest.pack_sequence_as(
      ctx.last_step_outputs, last_step_tensor_outputs)

  for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
    output = last_step_tensor_outputs_dict[name]
    # For outputs that have not been reduced, return a PerReplica of all
    # values. Else take the first value from the list as each value should be
    # the same.
    if reduce_op is None:
      last_step_tensor_outputs_dict[name] = values.PerReplica(output)
    else:
      last_step_tensor_outputs_dict[name] = output[0]
  ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
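# NOTE: Illustrative sketch (not part of the original source). The
# `all_gather` method above is what backs
# `tf.distribute.ReplicaContext.all_gather` on TPU:
#
#   @tf.function
#   def step():
#     ctx = tf.distribute.get_replica_context()
#     local = tf.fill([2], tf.cast(ctx.replica_id_in_sync_group, tf.int32))
#     return ctx.all_gather(local, axis=0)   # shape [2 * num_replicas]
#
#   strategy.run(step)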
   r   r   r   r   r   r   r   r   -tensorflow.python.distribute.cluster_resolverr   r   tensorflow.python.distribute.v1r  tensorflow.python.eagerr   r   r   tensorflow.python.frameworkr   r   r  r   r   r   r   r   r   r   tensorflow.python.opsr    r!   r"   r#   r$   r  tensorflow.python.ops.raggedr%   tensorflow.python.saved_modelr&   tensorflow.python.tpur(   device_assignment_libr)   r*   r+   tensorflow.python.tpu.opsr,   tensorflow.python.utilr-   r.   r/   r0   r  r3   r9   rd  r>   rG   ru   Strategyrx   deprecated_endpointsr|   
StrategyV1r   StrategyExtendedV1r   rh  bfloat16float16float32float64rN  uint32r{  rk  rz  rs  r6   r7   r8   <module>r     s            D = D Q 4 7 9 2 3 6 4 @ 1 3 / j E + 0 , 3 ; 3 . 6 + 5 4 3 + 2 * 7 < 6 6 L % 6 / - . ' , -  27 /  &6tn -"5d@N++ d@ 6d@N :rB!!!"GHQ/.)) Q/ I CQ/h >?@b<N-- b< Ab<LE$.33 E$P   OO
# List of `Tensor` dtypes supported by `cross_replica_sum()`.
_DTYPES_SUPPORTED_BY_CROSS_REPLICA_SUM = (
    dtypes.bfloat16,
    dtypes.float16,
    dtypes.float32,
    dtypes.float64,
    dtypes.int32,
    dtypes.uint32,
)