
"""APIs to deal with input datasets efficiently in DTensor.

When using tf.data with DTensor, the `DTensorDataset` API can be used to
efficiently handle loading the input data and correctly packing it to the
corresponding devices. This API is intended to work with unbatched data and can
be used for both data and model parallel setups.
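
A layout that is sharded only on the batch dimension gives data parallelism,
while additionally sharding non-batch dimensions splits each element across
devices for model parallelism.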

Example usage:

>>> # 1-D mesh with 4 devices
>>> mesh = dtensor.Mesh(dim_names=['batch'], ...)
>>> layout = dtensor.Layout.batch_sharded(mesh, 'batch', rank=1)
>>> dataset = tf.data.Dataset.range(256)
>>> d_dataset = dtensor.DTensorDataset(
...     dataset=dataset,
...     global_batch_size=16,
...     mesh=mesh,
...     layouts=layout,
...     batch_dim='batch')
>>> d_iter = iter(d_dataset)
>>> # Each batch is a length 16 tensor sharded across 4 devices
>>> batch_0_dtensor = next(d_iter)
>>> batch_0_dtensor
<tf.Tensor: shape=(16,),
            dtype=int64,
            value={"CPU:0": [ 0  1  2  4],
                   "CPU:1": [ 5  6  7  8],
                   "CPU:2": [ 9 10 11 12],
                   "CPU:3": [13 14 15 16]}>
>>> batch_1_dtensor = next(d_iter)
>>> batch_1_dtensor
<tf.Tensor: shape=(16,),
            dtype=int64,
            value={"CPU:0": [17 18 19 20],
                   "CPU:1": [21 22 23 24],
                   "CPU:2": [25 26 27 28],
                   "CPU:3": [29 30 31 32]}>

For multi-client setups, `DTensorDataset` interacts with tf.data service to
correctly distribute the dataset among the participating clients. DTensor works
with tf.data service in co-located mode where each worker is running alongside
the DTensor client (the TensorFlow Python process). The `TFDataServiceConfig`
dataclass can be filled with information about the tf.data service cluster, and
passed to `DTensorDataset` to enable distribution.
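
For example, the service can be described and passed to `DTensorDataset` as
follows (the dispatcher address and job name below are illustrative
placeholders):

>>> service_config = TFDataServiceConfig(
...     dispatcher_address='localhost:5050',
...     job_name='dtensor_input_job')
>>> d_dataset = dtensor.DTensorDataset(
...     dataset=dataset,
...     global_batch_size=16,
...     mesh=mesh,
...     layouts=layout,
...     batch_dim='batch',
...     tf_data_service_config=service_config)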
    N)AnyListOptionalSequenceTuple)api)configlayout)data_service_ops)
distribute)dataset_ops)iterator_ops)context)constant_op)dtypes)errors)ops)tensor)tensor_shape)tensor_spec)	array_ops)math_ops)data)nest)	tf_exportc                   &    e Zd ZU dZeed<   eed<   y)TFDataServiceConfiga0  Specifies the tf.data service configuration to use.

  Attributes:
    dispatcher_address: a string specifying the address of the tf.data service
      dispatcher server.
    job_name: a non-empty string identifying the shared job that will be created
      on tf.data service to process this dataset.
  """
  dispatcher_address: str
  job_name: str


class _DTensorIterator(iterator_ops.OwnedIterator):
  """An iterator for a tf.data.Dataset distributed using DTensor.

  DTensorIterator encapsulates multiple underlying dataset iterators. It handles
  retrieving the tensors to be placed on each underlying device and then uses
  the 'pack' operation to create and return a DTensor. Thus users need only
  interact with a single DTensorIterator to automatically distribute dataset
  tensors onto devices.
  """

  def __init__(self, dtensor_components: Tuple[tensor.Tensor],
               global_element_spec: tensor_spec.TensorSpec, layouts: Any):
    """Initializes a distributed iterator for DTensor datasets.

    This iterator encapsulates tf.data iterators for the underlying devices, and
    treats them as a packed DTensor of iterator resource tensors.

    Args:
      dtensor_components: a tuple containing the underlying iterator resources
        packed into a DTensor. This is expected to be a tuple with a single
        element.
      global_element_spec: the underlying dataset's element spec from a global
        view.
      layouts: a structure of DTensor layouts to be applied to the elements
        returned by the underlying iterators. This can be a single layout or
        (possibly nested) tuples or dictionaries of layouts, and the structure
        must match the structure of the iterator elements.
    """
    # `dtensor_components` is expected to be a single-element tuple holding
    # the packed iterator resource DTensor.
    [self._iterator_resource_dtensor] = dtensor_components
    self._global_element_spec = global_element_spec
    self._layouts = layouts
    self._layouts_str = nest.map_structure(
        lambda layout: layout.to_string(), layouts)

    super().__init__(
        components=dtensor_components, element_spec=global_element_spec)

  def __next__(self):
    try:
      # Fetch the host-side element and copy each component onto the device
      # mesh using its requested layout.
      host_elem = self._next_internal()
      context.async_wait()
      device_elem = nest.map_structure(
          api.copy_to_mesh, host_elem, self._layouts)
      context.async_wait()
      return device_elem
    except errors.OutOfRangeError as e:
      # Match TF2 eager executors by raising StopIteration when the iterator
      # reaches the end of the dataset.
      if context.executing_eagerly():
        raise StopIteration from e
      else:
        raise e

  @property
  def _type_spec(self):
    return _DTensorIteratorSpec(self._global_element_spec, self._layouts_str)


class _DTensorIteratorSpec(iterator_ops.IteratorSpec):
  """Type specification for `_DTensorIterator`."""

  __slots__ = ['_global_element_spec', '_layouts_str']

  def __init__(self, global_element_spec: tensor_spec.TensorSpec,
               layouts_str: Any):
    super().__init__(global_element_spec)
    self._global_element_spec = global_element_spec
    self._layouts_str = layouts_str

  @property
  def value_type(self):
    return _DTensorIterator

  def _serialize(self):
    return (self._global_element_spec, self._layouts_str)

  @property
  def _component_specs(self):
    return (tensor_spec.TensorSpec([], dtypes.resource),)

  def _to_components(self, value):
    return (value._iterator_resource_dtensor,)

  def _from_components(self, components):
    layouts = nest.map_structure(
        layout_lib.Layout.from_string, self._layouts_str)
    return _DTensorIterator(
        dtensor_components=components,
        global_element_spec=self._global_element_spec,
        layouts=layouts)

  @classmethod
  def from_value(cls, value):
    return cls(value._global_element_spec, value._layouts_str)


def _validate_input(flattened_layouts: Sequence[layout_lib.Layout],
                    flattened_elem_spec: Sequence[tensor_spec.TensorSpec],
                    dataset_already_batched: bool):
  """Checks that the dataset's layouts and element specs are compatible.

  Args:
    flattened_layouts: the flattened list of layouts used to distribute the
      dataset.
    flattened_elem_spec: the flattened list of element specs used in the
      dataset's components.
    dataset_already_batched: whether the dataset to be validated is already
      batched.

  Raises:
    ValueError: if the dataset's inputs are incompatible.
  """
  if not flattened_elem_spec:
    raise ValueError(
        'Expected input element spec of at least one element, was empty.')

  first_elem_shape = flattened_elem_spec[0].shape

  for layout, elem_spec in zip(flattened_layouts, flattened_elem_spec):
    if elem_spec.shape.rank is None:
      raise ValueError(
          'Dataset element shape must have a valid rank, got spec %s.' %
          elem_spec)

    # The layout is expected to have one more dimension than the element spec
    # (the batch dimension) when the dataset has not been batched yet.
    expected_rank = elem_spec.shape.rank
    if not dataset_already_batched:
      expected_rank += 1
    if layout.rank != expected_rank:
      raise ValueError(
          ('Expected layout with rank %d for element spec %s, got layout %s. '
           'Check that the dataset is not batched before passing to '
           'DTensorDataset.') %
          (expected_rank, elem_spec, layout.sharding_specs))

    if dataset_already_batched:
      batch_dim_size = first_elem_shape.as_list()[0]
      if batch_dim_size is None:
        raise ValueError(
            ('Size of batch dimension of element spec %s is None. Ensure '
             'drop_remainder=True when batching the dataset.') % elem_spec)

      if elem_spec.shape.as_list()[0] != batch_dim_size:
        raise ValueError(
            ('Size of batch dimension of element spec %s does not match '
             'expected size %d.') % (elem_spec, batch_dim_size))


def _shard_counts(layout: layout_lib.Layout,
                  batch_dim: Optional[str] = None) -> List[int]:
  """Computes a list of the number of shards in each dimension of the layout.

  The shard counts are used to slice each dataset element. The batch dimension's
  count is overridden to 1 since we only consider how many shards to make
  locally (within each local replica). Sharding across clients is handled by
  either tf.data.Dataset's shard transformation (in the single-client case) or
  tf.data service's distribute function (in the multi-client case).
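
  For example, a rank-2 layout with sharding specs `['batch', 'model']` on a
  mesh whose `'model'` dimension has size 2 yields shard counts `[1, 2]` when
  `batch_dim='batch'`: the batch dimension is never sharded locally, and each
  element is split into two shards along its second dimension.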

  Args:
    layout: the layout to compute the shard counts for.
    batch_dim: the name of the batch dimension of the layout, if present.

  Returns:
    A list of shard counts, one element per dimension of the layout.
  """
  shard_counts = []
  for spec in layout.sharding_specs:
    # The batch dimension is not sharded locally within a replica, and
    # unsharded dimensions are kept whole.
    if spec in (batch_dim, layout_lib.UNSHARDED):
      shard_counts.append(1)
    else:
      shard_counts.append(layout.mesh.dim_size(spec))

  return shard_counts


def _index_matrix(layout: layout_lib.Layout,
                  elem_spec: tensor_spec.TensorSpec) -> tensor.Tensor:
  """Computes a utility matrix to derive device-based slice offsets.

  This function builds a matrix of shape `[mesh.rank, layout.rank]` for each
  dataset element. This matrix can be used to slice the DTensor components
  returned by the iterator according to the local device that component is to be
  placed on. This can be done by multiplying the device offsets of shape
  `[1, mesh.rank]` with this index matrix to get a `[1, layout.rank]` shape
  tensor containing the slice offsets.

  Note: the index on the batch dim is always 0 since sharding on the batch
  dimension is handled by either tf.data.Dataset's shard transformation (in the
  single-client case) or tf.data service's distribute function (in the
  multi-client case). If there is no sharding on the batch dimension (or any
  other dimension), the slice index remains 0.
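
  For example, take an element spec of shape `[16]` and a rank-2 layout with
  sharding specs `['batch', 'model']` on a mesh whose `'model'` dimension has
  size 2. The row of the matrix for the `'batch'` mesh dimension is `[0, 0]`
  and the row for `'model'` is `[0, 8]`, so a device at `'model'` coordinate 1
  starts its slice of each batch at offset 8 in the second dimension.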

  Args:
    layout: the layout of the dataset element.
    elem_spec: the spec of the dataset element.

  Returns:
    The index matrix as a tensor.
  """
  matrix = []
  for dim in layout.mesh.dim_names:
    # One row per mesh dimension; the leading 0 corresponds to the batch
    # dimension of the element, which is never sliced here.
    row = [0]
    for layout_idx, spec in enumerate(layout.sharding_specs[1:]):
      if spec == layout_lib.UNSHARDED or spec != dim:
        row.append(0)
      else:
        row.append(elem_spec.shape[layout_idx] // layout.mesh.dim_size(dim))
    matrix.append(row)

  return constant_op.constant(matrix, dtype=dtypes.int32)


def _pack_iterator_resource_dtensor(
    datasets: List[Tuple[int, data_types.DatasetV2]],
    layouts: Any,
    mesh: layout_lib.Mesh,
    num_local_devices_per_replica: int):
  """Creates a DTensor iterator resource for the per-replica datasets.

  Given a list of replica ID to tf.data.Dataset mappings, this function creates
  iterators for each device and then packs the underlying iterator resource
  tensors into a single DTensor. This resource tensor is used by the
  IteratorGetNext op to retrieve the next element in the dataset.
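
  For example, with two local replicas and two local devices per replica,
  `datasets` holds two `(replica_id, dataset)` pairs and the returned DTensor
  packs four per-device iterator resource tensors, one for each local device.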

  Args:
    datasets: a list of tuples of each unique local replica ID to the dataset
      object whose elements will be placed on the devices corresponding to that
      replica.
    layouts: a structure of DTensor layouts to be applied to the elements
      returned by the underlying iterators. This can be a single layout or
      (possibly nested) tuples or dictionaries of layouts, and the structure
      must match the structure of the iterator elements.
    mesh: the DTensor mesh to place the iterator batches on.
    num_local_devices_per_replica: the number of devices in each data-parallel
      replica.

  Returns:
    A DTensor of the underlying iterator resource tensors.
  """
  host_mesh_devices = mesh.host_mesh().local_devices()
  device_idx = 0
  iterators = []

  for _, dataset in datasets:
    for idx in range(num_local_devices_per_replica):
      # Each local device of the replica gets its own shard of the replica's
      # dataset, with the iterator resource created on the host device.
      with ops.device_v2(host_mesh_devices[device_idx]):
        device_dataset = dataset.shard(
            num_shards=num_local_devices_per_replica, index=idx)
        iterators.append(iter(device_dataset))
      device_idx += 1

  if device_idx != len(host_mesh_devices):
    raise ValueError(
        'The `datasets` argument does not have the correct number of '
        f'underlying datasets, found {device_idx} but expected '
        f'{len(host_mesh_devices)}.')

  host_layouts = nest.map_structure(
      lambda l: layout_lib.Layout(l.sharding_specs, mesh.host_mesh()), layouts)

  # Pack the per-device iterator resource tensors into a single DTensor and
  # register the element layouts the iterator should produce.
  iterator_resources = [it._iterator_resource for it in iterators]  # pylint: disable=protected-access
  d_iterator_resource = api.pack(
      iterator_resources,
      layout_lib.Layout.replicated(mesh.host_mesh(), rank=0))
  api._dtensor_device().set_iterator_element_layouts(  # pylint: disable=protected-access
      d_iterator_resource, nest.flatten(host_layouts))

  return d_iterator_resource


@tf_export('experimental.dtensor.DTensorDataset', v1=[])
class DTensorDataset(dataset_ops.UnaryUnchangedStructureDataset):
  """A dataset of DTensors.

  DTensorDataset encapsulates a `tf.data.Dataset` whose elements are
  automatically packed and returned as DTensors based on a given mesh and
  layouts.
  """

  def __init__(self,
               dataset: data_types.DatasetV2,
               mesh: layout_lib.Mesh,
               layouts: Any,
               global_batch_size: int,
               *,
               dataset_already_batched: bool = False,
               batch_dim: Optional[str] = None,
               prefetch: Optional[int] = None,
               tf_data_service_config: Optional[TFDataServiceConfig] = None):
    """Creates a DTensorDataset.

    DTensorDataset automatically handles distribution of the dataset elements to
    each client's devices. It can be used to create an iterator that returns
    DTensors of the input data on each iteration.

    DTensorDataset works best with unbatched datasets. It takes the mesh and the
    provided layouts to automatically calculate how to batch the input locally
    for each replica.

    If the provided dataset is already batched according to the per-replica
    batch size, then `dataset_already_batched` must be set and DTensorDataset
    will check that the batch size is consistent with the intended
    `global_batch_size` using the layout information. Each replica receives a
    separate slice of the global batch, thus the per-replica batch size can be
    computed as the global batch size divided by the number of model replicas.
    For a DTensor mesh, the number of replicas is equal to the size of the
    mesh's batch dimension.
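
    For example, with `global_batch_size=16` on a mesh whose batch dimension
    has size 4, each replica receives a batch of 4 elements per step, so an
    already-batched input dataset must have been batched to 4 with
    `drop_remainder=True`.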

    Note: `tf.experimental.dtensor.DTensorDataset` instances do *not* implement
    the full interface of `tf.data.Dataset`. It only supports two usages we will
    mention below: iteration and `element_spec`. We don't support any other APIs
    to transform or inspect the dataset.

    TODO(b/223275517): add support for input datasets that are already batched
    to the global batch size.

    Args:
      dataset: a `tf.data.Dataset` object.
      mesh: the DTensor mesh to place the dataset batches on.
      layouts: a structure of DTensor layouts to be applied to the input dataset
        values. This can be a single layout or (possibly nested) tuples or
        dictionaries of layouts, and the structure must match the structure of
        the dataset. Either all or none of the layouts should be sharded on the
        batch dimension; having only a subset of layouts batch sharded will not
        work and raises a ValueError.
      global_batch_size: the desired global batch size.
      dataset_already_batched: must be set only if the dataset is already
        batched to the per-replica batch size. The batched dataset must have
        `drop_remainder=True` set since DTensor requires static shapes for
        slicing the input tensors.
      batch_dim: the mesh dimension on which the input's batch dimension is
        sharded. Set to None if the input layouts do not shard on the batch
        dimension.
      prefetch: number of batches to prefetch using Dataset.prefetch.
      tf_data_service_config: if operating in multi-client mode, this config
        specifies the tf.data service configuration to use.

    Raises:
      ValueError: on any of the following situations,
        1. if the structures and ranks of layouts and the dataset do not match.
        2. if the shapes in the dataset's spec are not fully defined.
        3. if batch_dim is specified and all layouts are not batch-sharded.
        4. if per_replica_batch_size is specified for an already batched Dataset
           but it does not match the expected per-replica size based on the
           provided mesh.
      TypeError: if type of structures of layouts and the dataset do not match.
    """
    super().__init__(dataset, dataset_ops.to_variant(dataset))

    # TODO(b/271162918): remove once multi-client tf.data service support
    # lands.
    if tf_data_service_config is not None:
      raise NotImplementedError(
          'Multi-client DTensorDataset is currently not supported. '
          'Check b/271162918.')

    self._mesh = mesh
    self._layouts = layouts
    self._batch_dim = batch_dim
    self._prefetch = prefetch
    self._tf_data_service_config = tf_data_service_config

    nest.assert_same_structure(dataset.element_spec, layouts)
    flattened_layouts = nest.flatten(layouts)
    flattened_elem_spec = nest.flatten(dataset.element_spec)

    if batch_dim:
      num_global_replicas = mesh.dim_size(batch_dim)
      self._local_replica_ids = list(
          dict.fromkeys(
              [loc[batch_dim] for loc in mesh.local_device_locations()]))

      for layout in flattened_layouts:
        if batch_dim != layout.sharding_specs[0]:
          raise ValueError(
              f'batch_dim {batch_dim} was specified but at least one layout '
              f'did not contain it: {layout}')
    else:
      # Only one replica since there is no sharding on the batch dimension.
      num_global_replicas = 1
      self._local_replica_ids = [0]

    _validate_input(
        flattened_layouts,
        flattened_elem_spec,
        dataset_already_batched=dataset_already_batched)

    expected_batch_size = global_batch_size // num_global_replicas
    if not dataset_already_batched:
      self._batched_dataset = dataset.batch(
          expected_batch_size, drop_remainder=True)
    else:
      per_replica_batch_size = flattened_elem_spec[0].shape.as_list()[0]
      if per_replica_batch_size != expected_batch_size:
        raise ValueError(
            'per_replica_batch_size does not match expected size based on '
            'the mesh, got %d but expected %d.' %
            (per_replica_batch_size, expected_batch_size))
      self._batched_dataset = dataset

    # Construct the global element spec of the dataset by replacing the
    # leading (per-replica) batch dimension with the global batch size.
    flattened_global_elem_spec = []
    batch_tensor_shape = tensor_shape.as_shape([global_batch_size])
    for elem_spec in nest.flatten(self._batched_dataset.element_spec):
      new_elem_spec = tensor_spec.TensorSpec(
          shape=operator.concat(batch_tensor_shape, elem_spec.shape[1:]),
          dtype=elem_spec.dtype,
          name=elem_spec.name)
      flattened_global_elem_spec.append(new_elem_spec)
    self._global_element_spec = nest.pack_sequence_as(
        dataset.element_spec, flattened_global_elem_spec)

    num_global_devices_per_replica = config.num_global_devices(
        mesh.device_type) // num_global_replicas
    self._num_local_replicas = len(self._local_replica_ids)
    self._num_local_devices_per_replica = (
        mesh.num_local_devices() // self._num_local_replicas)
    # The number of clients each replica is split over.
    self._num_clients_per_replica = (
        num_global_devices_per_replica // self._num_local_devices_per_replica)
    # When a replica is split across multiple clients, an offset is added to
    # the device index used by the partitioning logic so that local devices on
    # this client are matched to the correct slices of the input tensors.
    self._partition_offset = (
        config.client_id() % self._num_clients_per_replica
    ) * self._num_local_devices_per_replica

    # Helper structures used when partitioning the dataset tensors.
    self._all_shard_counts = [
        _shard_counts(layout, batch_dim) for layout in flattened_layouts
    ]
    self._index_matrices = [
        _index_matrix(layout, elem_spec)
        for layout, elem_spec in zip(flattened_layouts, flattened_elem_spec)
    ]

  def __iter__(self):
    datasets: List[Tuple[int, data_types.DatasetV2]] = []

    # Start with the batched dataset and apply distribution (if any) first so
    # that all remaining transformations are executed locally.
    local_dataset = self._batched_dataset

    if self._batch_dim is not None:
      if self._num_clients_per_replica > 1:
        # A replica is split over multiple clients, so each batch has to be
        # repeated before distribution once per participating client.
        local_dataset = self._repeat_batch(local_dataset,
                                           self._num_clients_per_replica)
        sharding_policy = data_service_ops.ShardingPolicy.DATA
      else:
        # Replicas are unique to each client, so FILE based sharding can be
        # used, which is more performant since each worker does not need to
        # read the entire dataset.
        sharding_policy = data_service_ops.ShardingPolicy.FILE
    else:
      # No batch dimension sharding was specified, so disable dataset sharding
      # during the distribute step.
      sharding_policy = data_service_ops.ShardingPolicy.OFF

    if self._tf_data_service_config is not None:
      local_dataset = local_dataset.apply(
          data_service_ops.distribute(
              processing_mode=sharding_policy,
              service=self._tf_data_service_config.dispatcher_address,
              job_name=self._tf_data_service_config.job_name,
              target_workers='LOCAL'))

    for local_replica_idx, replica_id in enumerate(self._local_replica_ids):
      # Select the shard of the dataset corresponding to this replica.
      dataset = distribute._AutoShardDataset(  # pylint: disable=protected-access
          local_dataset,
          num_workers=self._num_local_replicas,
          index=local_replica_idx,
          num_replicas=self._num_local_replicas)

      # Repeat each batch once per local device in the replica, then slice out
      # the per-device shard of any non-batch sharded dimension.
      dataset = self._repeat_batch(dataset,
                                   self._num_local_devices_per_replica)
      dataset = self._partition(dataset)

      # Apply prefetch last; since each batch is repeated, the number of
      # prefetched batches is scaled by the same factor.
      if self._prefetch is not None:
        dataset = dataset.prefetch(
            self._prefetch * self._num_local_devices_per_replica)

      datasets.append((replica_id, dataset))

    # Pack the per-device iterators into a single DTensor of iterator
    # resources placed on the host mesh.
    d_iterator_resource = _pack_iterator_resource_dtensor(
        datasets=datasets,
        layouts=self._layouts,
        mesh=self._mesh,
        num_local_devices_per_replica=self._num_local_devices_per_replica)

    return _DTensorIterator(
        dtensor_components=(d_iterator_resource,),
        global_element_spec=self._global_element_spec,
        layouts=self._layouts)

  def _repeat_batch(self, dataset, repeats):
    """Repeats each batch of the dataset `repeats` times."""
    if repeats == 1:
      return dataset

    def repeat(*x):
      return dataset_ops.DatasetV2.from_tensors(x).repeat(repeats)

    return dataset.flat_map(repeat)

  def _partition(self, dataset):
    """Slices each dataset element on any sharded non-batch dimension."""
    # Short circuit when there is no non-batch dimension sharding to apply and
    # no offset into the replica's devices.
    if (self._num_local_devices_per_replica == 1 and
        self._partition_offset == 0):
      return dataset

    def slice_batch(index, batch):
      flattened_batch = nest.flatten(batch)
      flattened_output = []

      # Map the enumeration index to the local device within the replica that
      # this copy of the batch is destined for.
      norm_index = math_ops.cast(
          index % self._num_local_devices_per_replica, dtype=dtypes.int32)
      norm_index += self._partition_offset
      coords = self._mesh.coords(norm_index)
      coords = array_ops.reshape(coords, (1, -1))

      for element, shard_counts, idx_matrix in zip(flattened_batch,
                                                   self._all_shard_counts,
                                                   self._index_matrices):
        indexes = math_ops.matmul(coords, idx_matrix)
        start = array_ops.reshape(indexes, (-1,))
        size = array_ops.shape_v2(
            element, out_type=dtypes.int32) // shard_counts
        flattened_output.append(
            array_ops.slice(element, begin=start, size=size))

      return nest.pack_sequence_as(batch, flattened_output)

    enumerated_dataset = dataset.enumerate()
    partitioned_dataset = enumerated_dataset.map(slice_batch)
    return partitioned_dataset

  @property
  def element_spec(self):
    return self._global_element_spec