
    2Vh                     ~    d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ  G d d	e      Zd
 Zd Zy)    N)backend)Layer)argument_validation)numerical_utils)tf_utils
tensorflowc                       e Zd ZdZ	 	 	 	 	 	 	 d% fd	Zd&dZd Z fdZd Zd'dZ	d Z
d	 Zed
        Zed        Zd Zd Zd'dZd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z d Z!d Z"d  Z#d! Z$d" Z%d# Z&d$ Z' xZ(S )(IndexLookupa  Maps values from a vocabulary to integer indices.

    This layer translates a set of arbitrary hashables into an integer output
    via a table-based lookup, with optional out-of-vocabulary handling. This is
    the basis layer for both IntegerLookup and StringLookup; it holds the common
    logic but is not intended to be exported as part of the Keras API.

    Args:
        max_tokens: The maximum size of the vocabulary for this layer.
            If `None`, there is no cap on the size of the vocabulary.
            Note that this size includes the OOV and mask tokens.
        num_oov_indices: The number of out-of-vocabulary tokens to use.
            If this value is more than 1, OOV inputs are hashed to determine
            their OOV value. If this value is 0,
            OOV inputs will cause an error when calling the layer.
        mask_token: A token that represents masked inputs.
            When `output_mode` is `"int"`,
            the token is included in vocabulary and mapped to index 0.
            In other output modes, the token will not appear in the vocabulary
            and instances of the mask token in the input will be dropped.
            If set to `None`, no mask term will be added.
        oov_token: Only used when `invert` is `True`.
            The token to return for OOV indices.
        vocabulary: Optional. Either an array or a string path to a text file.
            If passing an array, can pass a tuple, list, 1D numpy array,
            or 1D tensor containing the vocbulary terms.
            If passing a file path, the file should contain one line per term
            in the vocabulary. If this argument is set,
            there is no need to `adapt` the layer.
        vocabulary_dtype: The dtype of the vocabulary terms.
            For example, `"int64"` or `"string"`.
        idf_weights: Only valid when `output_mode` is `"tf_idf"`.
            A tuple, list, 1D numpy array, or 1D tensor or the same length
            as the vocabulary, containing the floating point
            inverse document frequency weights, which will be multiplied
            by per sample term counts for the final TF-IDF
            weight. If the `vocabulary` argument is set, and `output_mode`
            is `"tf_idf"`, this argument must be supplied.
        invert: Only valid when `output_mode` is `"int"`.
            If `True`, this layer will map indices to vocabulary items
            instead of mapping vocabulary items to indices.
            Defaults to `False`.
        output_mode: Specification for the output of the layer. Values can be
            `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"`
            configuring the layer as follows:
            - `"int"`: Return the raw integer indices of the input tokens.
            - `"one_hot"`: Encodes each individual element in the input into an
                array the same size as the vocabulary, containing a 1
                at the element index. If the last dimension is size 1,
                will encode on that dimension.
                If the last dimension is not size 1,
                will append a new dimension for the encoded output.
            - `"multi_hot"`: Encodes each sample in the input into
                a single array the same size as the vocabulary,
                containing a 1 for each vocabulary term present in the sample.
                Treats the last dimension as the sample dimension,
                if input shape is `(..., sample_length)`, output shape will
                be `(..., num_tokens)`.
            - `"count"`: As `"multi_hot"`, but the int array contains a count
                of the number of times the token at that index appeared
                in the sample.
            - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm
                is applied to find the value in each token slot.
            Defaults to `"int"`.
        pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
            `"count"`, or `"tf_idf"`. If `True`, the output will have its
            feature axis padded to `max_tokens` even if the number
            of unique tokens in the vocabulary is less than max_tokens,
            resulting in a tensor of shape `(batch_size, max_tokens)`
            regardless of vocabulary size. Defaults to `False`.
        sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`,
            `"count"` and `"tf-idf"` output modes.
            If `True`, returns a `SparseTensor` instead of a dense `Tensor`.
            Defaults to `False`.
    c                    ||dk  rt        d|       |r|t        d|       |dk  rt        d|       |	dk(  rd}	|	dk(  rd	}	t        j                  |	d
| j                  j                  d       |r|	dk7  rt        d|	       |
r|	dk(  rt        d|
 d|	       ||	d	k7  rt        d| d|	       t
        |   |       d| _        d| _        d| _	        || _
        || _        || _        || _        || _        |	| _        |
| _        || _        t%        j&                  |      j(                  | _        |j-                  dd       | _        || _        || _        |j-                  d|d u      | _        |j-                  dd        |j-                  dd        |rt        d|       |r.d| _        | j*                  | _        d}|}| j                  | _        n| j*                  | _        d| _        |}| j                  dk(  rdn(t%        j&                  | j8                        j<                  }| j                  dk(  rd| _        n,| j                  dk(  r| j?                         | _        nd| _        | j                  Jt%        j@                  || j6                        | _!        t%        j@                  || j8                        | _"        | j                  d	k(  ri| j4                  r|t        d      |Nt%        jF                  |tI        jJ                         d      | _&        | jL                  jO                         | _(        || jS                  ||       n| jU                         | _+        | j4                  st$        jX                  jZ                  j]                  |dd      | _/        | j                  d	k(  rOt$        jX                  jZ                  j]                  |dd      | _0        t%        jF                  ddd      | _1        y y y )N   zBIf set, `max_tokens` must be greater than 1. Received: max_tokens=zJIf pad_to_max_tokens is True, must set `max_tokens`. Received: max_tokens=r   zP`num_oov_indices` must be greater than or equal to 0. Received: num_oov_indices=binary	multi_hotztf-idftf_idf)intone_hotr   countr   output_mode)allowable_stringscaller_namearg_namer   zK`output_mode` must be `'int'` when `invert` is true. Received: output_mode=zt`sparse` may only be true if `output_mode` is `'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. Received: sparse=z and output_mode=zW`idf_weights` should only be set if `output_mode` is `'tf_idf'`. Received: idf_weights=)nameFTvocabulary_sizehas_input_vocabulary	trainabledtypez"Unrecognized keyword argument(s): int64zsWhen specifying the `vocabulary` argument, in TF-IDF output mode, the `idf_weights` argument must also be provided.r   r   )	key_dtypevalue_dtypedefault_value)2
ValueErrorr   validate_string_arg	__class____name__super__init___convert_input_args!_allow_non_tensor_positional_argssupports_jitinvert
max_tokensnum_oov_indices
mask_token	oov_tokenr   sparsepad_to_max_tokenstfas_dtyper   vocabulary_dtypepop_frozen_vocab_sizeinput_vocabularyinput_idf_weights_has_input_vocabulary
_key_dtype_value_dtype_default_valuemax_oov_start_indexconvert_to_tensor	_mask_key_mask_valueVariabler   floatxidf_weightsvalueidf_weights_constset_vocabulary_uninitialized_lookup_tablelookup_tablelookupexperimentalMutableHashTabletoken_countstoken_document_countsnum_documents)selfr-   r.   r/   r0   r5   
vocabularyrE   r,   r   r1   r2   r   kwargsmask_key
mask_valuer%   s                   [/home/dcms/DCMS/lib/python3.12/site-packages/keras/src/layers/preprocessing/index_lookup.pyr(   zIndexLookup.__init__Z   ss   $ !jAo((2|5 
 !3((2|5 
 Q--<,=?  ("%K(""K// //"	
 kU*))47 
 kU*$$*8 ,*m-  "{h'>55@M B*m-  	d##( 15.!$.$"&!2 ",< = B B"(**->"E *!,
 &,ZZ"Zt%;&
" 	

;%

7D!A&JKK%DO $ 5 5DH#J"&..D"33DO 'D!H
 ##u, [[!2!2377 
 ##q( ')#%%* '+&;&;&=# ')#??&11(DOOLDN!33D-- D x'))k.A - 
 &#%;;!..*#$ 
 *.)9)9)?)?)A&!
K8 !% @ @ BD )) "		 6 6 G G*# !H !D
 8+II**;;"2$+&' <  * &([[W&" , *    c                      j                   j                         dk(  rg g }}nU j                   j                         \  }} j                  r||fn||f\  }} j	                  |      |j                         }}t        j                   fdt        ||            }t         j                               D cg c]  }||   	 }} j                   j                  dk(  r j                  |d<   |s| j                         d } j                  dk(  r1|D cg c]%  }t        |t               r|j#                  d      n|' c}S |S c c}w c c}w )a  Returns the current vocabulary of the layer.

        Args:
            include_special_tokens: If `True`, the returned vocabulary
                will include mask and OOV tokens,
                and a term's index in the vocabulary
                will equal the term's index when calling the layer.
                If `False`, the returned vocabulary will not include
                any mask or OOV tokens.
        r   c                       j                   S N)r0   rQ   s   rV   <lambda>z,IndexLookup.get_vocabulary.<locals>.<lambda>,  s    DNN rW   Nr   stringzutf-8)rJ   sizeexportr,   _tensor_vocab_to_numpynumpycollectionsdefaultdictzipranger   r/   r   _token_start_indexr5   
isinstancebytesdecode)	rQ   include_special_tokensvocabindiceskeysvaluesrK   xis	   `        rV   get_vocabularyzIndexLookup.get_vocabulary  sE    !!#q(7E,,335LD&/3{{fd^vNE7++E2 E (("C$7
 %*$*>*>*@$ABqBB??&4+;+;u+DE!H%$11356E  H,JOEFZ5%9!q@  L Cs   7E*Ec                    t        j                         rBt        | j                  j	                         j                               | j                         z   S | j                  j	                         | j                         z   S )zGets the current size of the layer's vocabulary.

        Returns:
          The integer size of the vocabulary, including optional mask and oov
          indices.
        )r3   executing_eagerlyr   rJ   r^   ra   rf   r[   s    rV   r   zIndexLookup.vocabulary_size:  sk     !D%%**,2245))+,
 $$))+d.E.E.GGGrW   c                    | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  t        | j                        t        | j                        | j                  d}t        | 9         }t        t!        |j#                               t!        |j#                               z         S )N)r,   r-   r.   r0   r/   r   r1   r2   r5   rE   rR   r   )r,   r-   r.   r0   r/   r   r1   r2   r5   listify_tensorsr9   r8   r7   r'   
get_configdictlistitems)rQ   configbase_configr%   s      rV   rv   zIndexLookup.get_configI  s    kk//#33//++kk!%!7!7 $ 5 5*4+A+AB)$*?*?@#66
 g(*D**,-V\\^0DDEErW   c                     | j                          t        j                         5  | j                         | _        d d d        y # 1 sw Y   y xY wrZ   )_ensure_vocab_size_unchangedr3   
init_scoper   r7   r[   s    rV   _record_vocabulary_sizez#IndexLookup._record_vocabulary_size[  s=    ))+]]_ 	=&*&:&:&<D#	= 	= 	=s   AAc                 	   | j                   dk(  r|(t        d      |t        d| j                    d|       t        |t              ryt        j
                  j                  j                  |      st        d| d      | j                   dk(  rt        d      | j                  |      | _	        | j                          yt	        j                         sCt	        j                  |      st	        j                  |      rt        d	| j                   d
      t	        j                  |      r| j                  |      }n+t        |t         t"        f      rt%        j&                  |      }t	        j                  |      r|j)                         }n+t        |t         t"        f      rt%        j&                  |      }|j*                  dk(  rt        d|       | j-                         }| j/                         }| j0                  g|z  | j2                  g| j4                  z  z   }t%        j6                  ||d|       }|r||d }n|}| j9                  |      }|rt        d|       | j0                  Q| j0                  |v rCt%        j:                  || j0                  k(        d   }	t        d| d| j0                   d|	       | j2                  ]| j<                  rQ| j2                  |v rCt%        j:                  || j2                  k(        d   }
t        d| d| j2                   d|
       |t?        |      z   }| j@                  +|| j@                  kD  rt        d| d| j@                   d      | jC                  |      | _	        | j                          | j                   dk(  rM|It?        |      t?        |      k7  r#t        dt?        |       dt?        |             | jE                  |      }|jF                  dk7  rt        dtI        |             |rd}d}n|}t%        jJ                  |      }d}| jL                  r(| j@                  | j@                  |z
  t?        |      z
  }nd}t%        jN                  |||fd||f      }t	        jP                  |tS        jT                               }t	        jV                  |d      | _,        | jX                  j[                         | _.        yyy) a  Sets vocabulary (and optionally document frequency) for this layer.

        This method sets the vocabulary and idf weights for this layer directly,
        instead of analyzing a dataset through `adapt`. It should be used
        whenever the vocab (and optionally document frequency) information is
        already known.  If vocabulary data is already present in the layer, this
        method will replace it.

        Args:
            vocabulary: Either an array or a string path to a text file.
                If passing an array, can pass a tuple, list,
                1D numpy array, or 1D tensor containing the vocbulary terms.
                If passing a file path, the file should contain one line
                per term in the vocabulary.
            idf_weights: A tuple, list, 1D numpy array, or 1D tensor
                of inverse document frequency weights with equal
                length to vocabulary. Must be set if `output_mode`
                is `"tf_idf"`. Should not be set otherwise.
        r   Nz5`idf_weights` must be set if output_mode is 'tf_idf'.zU`idf_weights` should only be set if output_mode is `'tf_idf'`. Received: output_mode=z and idf_weights=zVocabulary file z does not exist.zGoutput_mode `'tf_idf'` does not support loading a vocabulary from file.z(Cannot set a tensor vocabulary on layer zi when not executing eagerly. Create this layer or call `set_vocabulary()` outside of any traced function.r   z5Cannot set an empty vocabulary. Received: vocabulary=zlThe passed vocabulary has at least one repeated term. Please uniquify your dataset. The repeated terms are: r   a  Found reserved mask token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: mask_token=z at vocabulary index a  Found reserved OOV token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: oov_token=z^Attempted to set a vocabulary larger than the maximum vocab size. Received vocabulary size is z; `max_tokens` is .zI`idf_weights` must be the same length as vocabulary. len(idf_weights) is z; len(vocabulary) is r   zATF-IDF data must be a 1-index array. Received: type(idf_weights)=constantconstant_valuesr   F)r   )/r   r#   rg   strr3   iogfileexists_lookup_table_from_filerJ   r   rs   	is_tensorRuntimeErrorr   r`   rx   tuplenparrayra   r^   r?   rf   r/   r0   r.   array_equal_find_repeated_tokensargwherer,   lenr-   _lookup_table_from_tokens_convert_to_ndarrayndimtypeaverager2   padr@   r   rD   rC   rE   rF   rG   )rQ   rR   rE   	oov_starttoken_startspecial_tokensfound_special_tokenstokensrepeated_tokens
mask_index	oov_indexnew_vocab_sizefront_paddingfront_padding_valueback_padding_valueback_paddingweightss                    rV   rH   zIndexLookup.set_vocabulary`  s8   ( x'" K  $5595E5E4F G##.-1  j#&55;;%%j1 &zl2BC  8+ ,  !% < <Z HD((*##%LL$[(A:499+ F2 2  <<
#44Z@J
T5M2*-J<<$%++-KdE]3((;/K??a((2|5 
 ))+	--///*Y6NN:
  :! !  "~~J|4 
  -FF44V<'(*  ??&4??f+DZ4??%BCBGJ7 8F6F G((,'8 9$$.<1  NN&&(J$..$@A"EI7 8F6F G''+~~&6 7$$-;0  %s6{2??&NT__,L55C4D E##'??"316 
 !::6B$$&x'K,C:#k"22 ++.{+;*< =**-j/):< 
 22;?K1$ 3373D2EG  $ !&'# +&(jj&=# "#%%$//*EOOm3c+6FF   !ff-!46H I	G **7'..:JKG!{{ D &*%5%5%;%;%=D"[ -D'rW   c                     i S rZ    r[   s    rV   get_build_configzIndexLookup.get_build_config  s    	rW   c                 &    | j                  d        y rZ   )build)rQ   rz   s     rV   build_from_configzIndexLookup.build_from_config  s    

4rW   c                     | j                   S rZ   r5   r[   s    rV   compute_dtypezIndexLookup.compute_dtype      $$$rW   c                     | j                   S rZ   r   r[   s    rV   variable_dtypezIndexLookup.variable_dtype  r   rW   c                 z    | j                   dk(  r|S | j                  r| j                  n| j                  }|d   |fS )Nr   r   )r   r2   r-   r7   )rQ   input_shapedepths      rV   compute_output_shapez IndexLookup.compute_output_shape#  sI    u$ %% OO(( 	
 A&&rW   c                     | j                   dk(  rd}nt        j                         }| j                  |j                        }t        j
                  ||      S )Nr   r   r   )r   r   rD   r   shapeKerasTensor)rQ   inputsoutput_dtypeoutput_shapes       rV   compute_output_speczIndexLookup.compute_output_spec-  sH    u$"L">>+L00>""<|DDrW   c                    | j                          t        |t        j                  j                        r,||j                  |      }|D ]  }| j                  |        nat        j                  || j                        }|j                  j                  dk(  rt        j                  |d      }| j                  |       | j                          y )Nr   r   r   )reset_staterg   r3   dataDatasettakeupdate_stater   ensure_tensorr5   r   rankexpand_dimsfinalize_state)rQ   r   stepsbatchs       rV   adaptzIndexLookup.adapt5  s    dBGGOO, yy' )!!%() ))$d6K6KLDzz!# ~~dB/d#rW   c                 r   | j                   rt        d| j                   d      t        j                  || j
                        }|j                  j                  dk(  rt        j                  |d      }|j                  j                  dk(  rt        j                  |d      }| j                  |      \  }}| j                  j                  ||| j                  j                  |      z          | j                  dk(  r2t        |t        j                         rt        j"                  d |      }n<|D cg c]  }t        j$                  |      d    }}t        j&                  |d      }| j                  |      \  }}| j(                  j                  ||| j(                  j                  |      z          t        |t        j                         r*| j*                  j-                  |j/                                y | j*                  j-                  t        j                  |d	
      d          y y c c}w )NzCannot adapt layer 'z[' after setting a static vocabulary via `vocabulary` argument or `set_vocabulary()` method.r   r   r   r   c                 2    t        j                  |       d   S )Nr   )r3   uniquero   s    rV   r\   z*IndexLookup.update_state.<locals>.<lambda>]  s    ryy|A rW   )axisr   )out_type)r:   r#   r   r   r   r5   r   r   r3   r   _num_tokensrN   insertrK   r   rg   RaggedTensormap_fnr   concatrO   rP   
assign_addnrows)rQ   r   r   countsdeduped_doc_dataro   s         rV   r   zIndexLookup.update_stateE  s   %%&tyyk 2- -  %%d$2G2GH::??a>>$*D::??a >>$*D))$/  FT..55f==	
 x'$0#%99-F#M =A#BBIIaLO#B #B#%99-=A#F !--.>?NFF&&--!;!;!B!B6!JJ $0""--djjl;""--HHTG4Q7! (
 $Cs   7H4c                 r   | j                   s.t        j                  | j                  j	                         d      r?| j
                  dk(  r| j                  j                         | _        | j                          y | j                  D| j                  j                  t        j                  | j                  g| j                               | j                  D| j                  j                  t        j                  | j                  g| j                               | j                  j                         \  }}t!        j"                  |j%                         |j%                         f      d d d   }| j'                         }| j(                  r| j(                  |z
  }|d | }t        j*                  ||      }| j-                  |      | _        | j
                  dk(  r<| j0                  j3                  |      }| j5                  || j6                        }t        j8                  |t;        j<                               }t        j>                  || j'                         dggt        j@                  |            }| jB                  rG| j(                  ;t        j>                  |d| j(                  t        j                  |      z
  ggd      }t        jD                  |t;        j<                         d      | _        | j                  j                         | _        | jG                          | j                          y )Nr   r   r   r   Fr   )$r:   r3   equalrN   r^   r   rE   rF   rG   r   r/   remover@   r5   r0   r_   r   lexsortra   rf   r-   gatherr   rJ   rO   rK   _inverse_document_frequencyrP   castr   rD   r   reduce_meanr2   rC   r   )rQ   r   r   sorted_indicesr   max_learned_tokensrO   rE   s           rV   r   zIndexLookup.finalize_staten  s   %%$2C2C2H2H2JA)N 8+)-)9)9)?)?)A&((* ??&$$$$doo%68M8MN >>%$$$$dnn%5t7L7LM **113 V\\^V\\^$DEddK--/??!%;!>+,?-?@N6>2 ::6Bx'$($>$>$E$Ef$M!::%t'9'9K ''+w~~/?@K &&))+Q/0 "{ ;K
 %%$//*E ff277;+??@A$%
  "{{nn& D
 &*%5%5%;%;%=D"
 	$$&rW   c                 L   | j                   ry | j                  j                  | j                  j                         d          | j                  dk(  rR| j
                  j                  | j
                  j                         d          | j                  j                  d       y y )Nr   r   )r:   rN   r   r_   r   rO   rP   assignr[   s    rV   r   zIndexLookup.reset_state  s    %%  !2!2!9!9!;A!>?x'&&--**113A6 %%a(	 (rW   c                    ddl m} | j                          t        j                  || j
                        }|j                  }|j                  j                  dk(  r| j                  |d      }t        |t        j                        rEt        j                  |j                  | j                  |j                        |j                        }nVt        |t        j                         r+t        j"                  j%                  | j                  |      }n| j                  |      }| j&                  dk(  r'|j                  dk(  rt        j(                  |d      }|S | j*                  r| j,                  n| j.                  }| j&                  dk(  r| j0                  nd }t3        j4                  || j&                  dk(  rdn| j&                  || j6                  | j8                  |      }| j&                  dk(  rM|t;        d	      |j<                  j?                  |j@                  jC                  ||jD                        |      }|S )
Nr   r   r   r   r   r   r   )r   r   r   r1   backend_modulezAWhen `output_mode` is `'tf_idf'`, `idf_weights` must be provided.)#keras.src.backendr	   _ensure_known_vocab_sizer   r   r;   r   r   _expand_dimsrg   r3   SparseTensorrl   _lookup_densern   dense_shaper   raggedmap_flat_valuesr   squeezer2   r-   r7   rG   r   encode_categorical_inputsr<   r1   r#   ra   multiplycorer   r   )rQ   r   
tf_backendoriginal_shapelookupsr   rE   outputs           rV   callzIndexLookup.call  s   >%%'''dooF<<!&&vr2Ffboo.oo""6==1""G
 0ii//0B0BFKG((0Gu$""a'**Wb1N %% OO(( 	 '+&6&6(&BD"" 	 !::++x7T=M=M##;;%	
 x'"     %%..$$V[->->?F rW   c                 6   t        j                         r7t        j                  |      r"t        j                  || j
                        }n| j                  j                  |      }| j                  At        j                  || j                        }t        j                  || j                  |      }| j                  r|S g }| j                  dk(  rt        j                  t        j                  |d            }t        j                  ||      }t         j                   j#                  d|f      }t        j$                  t        j                  t        j&                  |      d      |g      }|j)                  |       n| j                  dkD  rt        j*                  | j,                        j.                  r+t         j0                  j3                  || j                        }n+t         j                   j5                  || j                        }|| j7                         z   }t        j                  || j8                        }	t        j                  |	||      }t        j:                  |      5  t        j<                  |      cddd       S # 1 sw Y   yxY w)zALookup table values for a dense Tensor, handling masking and OOV.r   Nr   r   zwWhen `num_oov_indices=0` all inputs should be in vocabulary, found OOV values {}, consider setting `num_oov_indices=1`.r   )num_buckets)r3   rs   r   is_keras_tensor
zeros_liker<   rJ   rK   r/   r   rA   whererB   r,   r.   	gather_ndstringsformatAssertr^   appendr4   r;   
is_integermathfloormodto_hash_bucket_fastr?   r=   control_dependenciesidentity)
rQ   r   r   mask_locationslookup_checksoov_indices
oov_inputsmsg	assertionoov_locationss
             rV   r   zIndexLookup._lookup_dense  s    !g&=&=f&EmmF$2C2CDG''..v6G??&XXfdnn=Nhh~t/?/?IG;;N1$((288GR#89Kfk:J**##MC
 		"((277;+?"CcUKI  +!!A%{{4??+66 gg..vt7K7KL jj<<(<(< =  &(=(=(??KHHWd.A.ABMhh}k7CG$$]3 	(;;w'	( 	( 	(s   0JJc                 ^    | j                   dk(  r| j                  j                         |d<   y y Nr   rE   )r   rG   ra   rQ   stores     rV   save_own_variableszIndexLookup.save_own_variables#  s.    x'#'#9#9#?#?#AE-  (rW   c                     | j                   dk(  r>| j                  j                  |d          | j                  j                         | _        y y r  )r   rE   r   rF   rG   r  s     rV   load_own_variableszIndexLookup.load_own_variables'  sE    x'##E-$89%)%5%5%;%;%=D" (rW   c                 P   | j                   y | j                  d      }t        j                  j                  j                  |d      }t        |d      5 }|j                  dj                  |D cg c]  }t        |       c}             d d d        y c c}w # 1 sw Y   y xY w)NT)rj   vocabulary.txtw
)	r8   rq   r3   r   r   joinopenwriter   )rQ   dir_pathrR   vocabulary_filepathfr  s         rV   save_assetszIndexLookup.save_assets,  s      , (((E
 eekk..x9IJ%s+ 	=qGGDIIz:!s1v:;<	= 	=:	= 	=s   B1BBBB%c                 0   | j                   y t        j                  j                  j	                  |d      }t        |d      5 }|j                         j                  d      }t        j                  | j                        t        j                  k(  r|D cg c]  }t        |       }}n|D cg c]  }t        |       }}| j                  dk(  r| j                  |d       n| j                  |       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nr  rr  r   F)rE   )r8   r3   r   r   r  r  readsplitr4   r5   r]   r   r   r   rH   )rQ   r  r   r!  lineslinern   s          rV   load_assetszIndexLookup.load_assets6  s      ,  eekk..x9IJ%s+ 		,qFFHNN4(E{{4001RYY>056#d)66056#d)668+##F#>##F+		, 		, 76		, 		,s+   ADD*D1D6D
DDc                     t        j                         5  t        | j                  | j                        }t         j
                  j                  || j                        cd d d        S # 1 sw Y   y xY wrZ   )r3   r~   get_null_initializerr;   r<   rK   StaticHashTabler=   )rQ   initializers     rV   rI   z'IndexLookup._uninitialized_lookup_tableH  sY    ]]_ 	O.!2!2K 99,,[$:M:MN		O 	O 	Os   A
A))A2c                    t        j                         5  | j                         }|t        j                  |      z   }| j                  r| j
                  n| j                  }t        j                  |||      }| j                  r||fn||f\  }}t         j                  j                  ||| j
                  | j                        }t         j                  j                  || j                        cd d d        S # 1 sw Y   y xY w)Nr   )r3   r~   rf   r^   r,   r;   r<   re   rK   KeyValueTensorInitializerr,  r=   )	rQ   r   r   	token_endindices_dtyperl   rm   rn   r-  s	            rV   r   z%IndexLookup._lookup_table_from_tokensO  s    ]]_ 	O113K#bggfo5I#';;D4E4E  hh{I]KG%)[[&!vw6G D& ))==fdoot/@/@K 99,,[$:M:MN	O 	O 	Os   CC::Dc           
      r   | j                   rIt        j                  j                  j                  }t        j                  j                  j
                  }nHt        j                  j                  j
                  }t        j                  j                  j                  }t        j                         5  t        j                  j                  || j                  || j                  || j                               }t        j                  j                  || j                        cd d d        S # 1 sw Y   y xY w)N)filenamer    	key_indexr!   value_indexvalue_index_offset)r,   r3   rK   TextFileIndexLINE_NUMBER
WHOLE_LINEr~   TextFileInitializerr;   r<   rf   r,  r=   )rQ   r3  r4  r5  r-  s        rV   r   z#IndexLookup._lookup_table_from_file_  s    ;;		//;;I))11<<K		//::I))11==K]]_ 		O))77!//# --'#'#:#:#< 8 K 99,,[$:M:MN		O 		O 		Os   2A1D--D6c                 \    t        |t        t        f      rt        j                  |      S |S rZ   )rg   rx   r   r   r   )rQ   ro   s     rV   r   zIndexLookup._convert_to_ndarrayq  s"    (T5M:rxx{AArW   c                     t        |t        j                        r t        j                  j	                  ||      S t        j                  ||      S rZ   )rg   r3   r   r1   r   )rQ   r   r   s      rV   r   zIndexLookup._expand_dimst  s8    fboo.99((66>>&$//rW   c                 @    | j                   | j                  dk(  rdS dS )Nr   r   r   )r/   r   r[   s    rV   r?   zIndexLookup._oov_start_indexz  s/     *t/?/?5/H 	
 	
rW   c                 <    | j                         | j                  z   S rZ   )r?   r.   r[   s    rV   rf   zIndexLookup._token_start_index  s    $$&)=)===rW   c                     | j                   dk(  s| j                  ry | j                  t        d| j                    d      y )Nr   When using `output_mode=z` and `pad_to_max_tokens=False`, you must set the layer's vocabulary before calling it. Either pass a `vocabulary` argument to the layer, or call `adapt` with some sample data.)r   r2   r7   r   r[   s    rV   r   z$IndexLookup._ensure_known_vocab_size  sQ    u$(>(>""**4+;+;*< =) )  +rW   c                 6   | j                   dk(  s| j                  ry t        j                         5  | j	                         }d d d        | j
                  8| j
                  k7  r(t        d| j                    d| j
                   d|       y y # 1 sw Y   NxY w)Nr   r@  zt` and `pad_to_max_tokens=False`, the vocabulary size cannot be changed after the layer is called. Old vocab size is z, new vocab size is )r   r2   r3   r~   r   r7   r   )rQ   r   s     rV   r}   z(IndexLookup._ensure_vocab_size_unchanged  s    u$(>(>]]_ 	4!113N	4 ##/$"9"99*4+;+;*< =- .2-D-D,E F%%3$4	6  : 0		4 	4s   BBc                     t        |      }t        |      t        |      k7  r;t        j                  |      j	                         D cg c]  \  }}|dkD  r| c}}S g S c c}}w )z+Return all repeated tokens in a vocabulary.r   )setr   rb   Counterry   )rQ   rR   vocabulary_setitemr   s        rV   r   z!IndexLookup._find_repeated_tokens  sd    Zz?c.11 $/#6#6z#B#H#H#JD%19   Is   	A c                 
   t        |t        j                        r|j                  }n>t        |t        j                        r|j
                  }nt        j                  |dg      }t        j                  |d      \  }}}||fS )z?Count the number of tokens in a ragged, sparse or dense tensor.r   r   )out_idx)rg   r3   r   rn   r   flat_valuesreshapeunique_with_counts)rQ   r   rI  r   _r   s         rV   r   zIndexLookup._num_tokens  sg    dBOO,++Kboo.**K**TB40K11+wO6v~rW   c                 R    t         j                  j                  d|d|z   z  z         S )aa  Computes the inverse-document-frequency (IDF) component of "tf_idf".
        Args:
            token_document_counts: An array of the # of documents each token
                appears in.
            num_documents: An int representing the total number of documents

        Returns:
            An array of "inverse document frequency" weights.
        r   )r3   r  log)rQ   rO   rP   s      rV   r   z'IndexLookup._inverse_document_frequency  s'     ww{{1}4I0IJJKKrW   c                 "    |j                         S )z3Converts a tensor vocabulary to a numpy vocabulary.)ra   )rQ   rR   s     rV   r`   z"IndexLookup._tensor_vocab_to_numpy  s    !!rW   )NNFr   FFN)TrZ   ))r&   
__module____qualname____doc__r(   rq   r   rv   r   rH   r   r   propertyr   r   r   r   r   r   r   r   r   r   r  r  r"  r)  rI   r   r   r   r   r?   rf   r   r}   r   r   r   r`   __classcell__)r%   s   @rV   r   r      s    Jf xt$LHF$=
s>j % % % %'E 'R?'B	)7r.(`B>
=,$OO O$B0
>
&
	
L"rW   r   c                 Z     G d dt         j                  j                        } || |      S )Nc                   <    e Zd ZdZd Zed        Zed        Zd Zy)-get_null_initializer.<locals>.NullInitializerz:A placeholder initializer for restoring from a SavedModel.c                      || _         || _        y)zConstruct a table initializer object.

            Args:
            key_dtype: Type of the table keys.
            value_dtype: Type of the table values.
            N)r;   r<   )rQ   r    r!   s      rV   r(   z6get_null_initializer.<locals>.NullInitializer.__init__  s     (DO +DrW   c                     | j                   S )zThe expected table key dtype.)r;   r[   s    rV   r    z7get_null_initializer.<locals>.NullInitializer.key_dtype  s     ??"rW   c                     | j                   S )zThe expected table value dtype.)r<   r[   s    rV   r!   z9get_null_initializer.<locals>.NullInitializer.value_dtype  s     $$$rW   c                      y)z$Returns the table initialization op.Nr   )rQ   tables     rV   
initializez8get_null_initializer.<locals>.NullInitializer.initialize  s    rW   N)	r&   rP  rQ  rR  r(   rS  r    r!   r]  r   rW   rV   NullInitializerrW    s7    H	, 
	# 
	# 
	% 
	%	rW   r^  )r3   rK   r/  )r    r!   r^  s      rV   r+  r+    s'    "))== 6 9k22rW   c                     t        j                  |       r| j                         } t        | t        j
                        r| j                         } | S )zFConvert any tensors or numpy arrays to lists for config serialization.)r3   r   ra   rg   r   ndarraytolistr   s    rV   ru   ru     s6    	||AGGI!RZZ HHJHrW   )rb   ra   r   	keras.srcr   keras.src.layers.layerr   keras.src.utilsr   r   r   keras.src.utils.module_utilsr	   r3   r   r+  ru   r   rW   rV   <module>rf     s6       ( / + $ 9|"% |"~3>rW   