
    2Vh4                     l    d dl mZ d dl mZ d dlmZ d dlmZ d dlmZ  ed       G d de             Z	y	)
    )backend)ops)keras_export)KerasTensor)Layerzkeras.layers.Attentionc                        e Zd ZdZ	 	 	 	 d fd	Zd Zd ZddZd Z	 	 	 	 ddZ	ddZ
d	 Z	 	 	 	 dd
ZddZ fdZ xZS )	Attentiona  Dot-product attention layer, a.k.a. Luong-style attention.

    Inputs are a list with 2 or 3 elements:
    1. A `query` tensor of shape `(batch_size, Tq, dim)`.
    2. A `value` tensor of shape `(batch_size, Tv, dim)`.
    3. A optional `key` tensor of shape `(batch_size, Tv, dim)`. If none
        supplied, `value` will be used as a `key`.

    The calculation follows the steps:
    1. Calculate attention scores using `query` and `key` with shape
        `(batch_size, Tq, Tv)`.
    2. Use scores to calculate a softmax distribution with shape
        `(batch_size, Tq, Tv)`.
    3. Use the softmax distribution to create a linear combination of `value`
        with shape `(batch_size, Tq, dim)`.

    Args:
        use_scale: If `True`, will create a scalar variable to scale the
            attention scores.
        dropout: Float between 0 and 1. Fraction of the units to drop for the
            attention scores. Defaults to `0.0`.
        seed: A Python integer to use as random seed in case of `dropout`.
        score_mode: Function to use to compute attention scores, one of
            `{"dot", "concat"}`. `"dot"` refers to the dot product between the
            query and key vectors. `"concat"` refers to the hyperbolic tangent
            of the concatenation of the `query` and `key` vectors.

    Call arguments:
        inputs: List of the following tensors:
            - `query`: Query tensor of shape `(batch_size, Tq, dim)`.
            - `value`: Value tensor of shape `(batch_size, Tv, dim)`.
            - `key`: Optional key tensor of shape `(batch_size, Tv, dim)`. If
                not given, will use `value` for both `key` and `value`, which is
                the most common case.
        mask: List of the following tensors:
            - `query_mask`: A boolean mask tensor of shape `(batch_size, Tq)`.
                If given, the output will be zero at the positions where
                `mask==False`.
            - `value_mask`: A boolean mask tensor of shape `(batch_size, Tv)`.
                If given, will apply the mask such that values at positions
                 where `mask==False` do not contribute to the result.
        return_attention_scores: bool, it `True`, returns the attention scores
            (after masking and softmax) as an additional output argument.
        training: Python boolean indicating whether the layer should behave in
            training mode (adding dropout) or in inference mode (no dropout).
        use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds
            a mask such that position `i` cannot attend to positions `j > i`.
            This prevents the flow of information from the future towards the
            past. Defaults to `False`.

    Output:
        Attention outputs of shape `(batch_size, Tq, dim)`.
        (Optional) Attention scores after masking and softmax with shape
            `(batch_size, Tq, Tv)`.
    c                     t        |   di | || _        || _        || _        | j                  dkD  r%t
        j                  j                  |      | _        | j                  dvrt        d|       d| _
        y )Nr   seed)dotconcatz_Invalid value for argument score_mode. Expected one of {'dot', 'concat'}. Received: score_mode=F )super__init__	use_scale
score_modedropoutr   randomSeedGeneratorseed_generator
ValueError_return_attention_scores)selfr   r   r   r   kwargs	__class__s         T/home/dcms/DCMS/lib/python3.12/site-packages/keras/src/layers/attention/attention.pyr   zAttention.__init__B   s     	"6""$<<!")..">">D">"ID??"33((2|5  ).%    c                    | j                  |       d | _        d | _        | j                  r%| j	                  ddd| j
                  d      | _        | j                  dk(  r&| j	                  ddd| j
                  d      | _        y y )Nscaler   onesT)nameshapeinitializerdtype	trainabler   concat_score_weight)_validate_inputsr    r'   r   
add_weightr%   r   )r   input_shapes     r   buildzAttention.buildZ   s    k*
#' >>"jj ) DJ ??h&'+*"jj (7 (D$ 'r   c                 f   | j                   dk(  rJt        j                  |t        j                  |g d            }| j                  || j                  z  }|S | j                   dk(  rt        j
                  |d      }t        j
                  |d      }| j                  I| j                  t        j                  t        j                  | j                  ||z   z        d      z  }|S | j                  t        j                  t        j                  ||z         d      z  }|S t        d	      )
a  Calculates attention scores as a query-key dot product.

        Args:
            query: Query tensor of shape `(batch_size, Tq, dim)`.
            key: Key tensor of shape `(batch_size, Tv, dim)`.

        Returns:
            Tensor of shape `(batch_size, Tq, Tv)`.
        r   )r         )axesr   axiszscores not computed)
r   r   matmul	transposer    expand_dimsr'   sumtanhr   )r   querykeyscores
q_reshaped
k_reshapeds         r   _calculate_scoreszAttention._calculate_scoreso   s    ??e#ZZs}}Sy'IJFzz%$**$$ # __( R8J26Jzz%11CGGHHTZZ:
+BCD25   11CGGHHZ*45B5   233r   c                    |Ot        j                  |      }|j                  dk(  rdnd}||t        j                  ||j                        z  z  }t        j                  |d      }|rE| j
                  dkD  r6t        j                  j                  || j
                  | j                        }t        j                  ||      |fS )	a  Applies attention scores to the given value tensor.

        To use this method in your attention layer, follow the steps:

        * Use `query` tensor of shape `(batch_size, Tq)` and `key` tensor of
            shape `(batch_size, Tv)` to calculate the attention `scores`.
        * Pass `scores` and `value` tensors to this method. The method applies
            `scores_mask`, calculates
            `attention_distribution = softmax(scores)`, then returns
            `matmul(attention_distribution, value).
        * Apply `query_mask` and return the result.

        Args:
            scores: Scores float tensor of shape `(batch_size, Tq, Tv)`.
            value: Value tensor of shape `(batch_size, Tv, dim)`.
            scores_mask: A boolean mask tensor of shape `(batch_size, 1, Tv)`
                or `(batch_size, Tq, Tv)`. If given, scores at positions where
                `scores_mask==False` do not contribute to the result. It must
                contain at least one `True` value in each line along the last
                dimension.
            training: Python boolean indicating whether the layer should behave
                in training mode (adding dropout) or in inference mode
                (no dropout).

        Returns:
            Tensor of shape `(batch_size, Tq, dim)`.
            Attention scores after masking and softmax with shape
                `(batch_size, Tq, Tv)`.
        float16g     @g    eAr%   r4   r1   r   r   )
r   logical_notr%   castsoftmaxr   r   r   r   r5   )r   r<   valuescores_masktrainingpadding_mask	max_valueweightss           r   _apply_scoreszAttention._apply_scores   s    < "??;7L $*<<9#<%Ii#((<v||"LLLF++f2.q(nn,,(( - G
 zz'5)722r   c                 b   |rt        j                  |      }d|d   |d   f}t        j                  |d      }t        j                  |d      }t        j                  |d      }t        j                  ||      }	|-t        j
                  |d      }t        j                  ||	      S |	S |S )Nr.   r0   r4   int32)r#   r%   r1   )r   r#   r!   cumsumgreater_equalr7   logical_and)
r   r<   v_maskuse_causal_maskscore_shape
mask_shape	ones_mask	row_index	col_indexcausal_masks
             r   _calculate_score_maskzAttention._calculate_score_mask   s     ))F+K[_k"o>JzAI

926I

926I++IyAK!b9v{;; Mr   c                    | j                  ||       || _        |d   }|d   }t        |      dkD  r|d   n|}|r|d   nd }	|r|d   nd }
| j                  ||      }| j	                  ||
|      }| j                  ||||      \  }}|	;t        j                  |	d      }	|t        j                  |	|j                  	      z  }|r||fS |S )
Ninputsmaskr   r.   r-   )r:   r;   )r<   rF   rG   rH   r4   r1   rB   )
r(   r   lenr?   rZ   rL   r   r7   rD   r%   )r   r]   r^   rH   return_attention_scoresrS   qvkq_maskrR   r<   rG   attention_outputattention_scoress                  r   callzAttention.call   s     	V$7(?%1I1IVqF1Ia ad ad''aQ'700FO
 .2-?-?h .@ .
** __V"5F7G7M7M NN"$&677##r   c                 h    | j                  ||       ||d   y t        j                  |d         S )Nr\   r   )r(   r   convert_to_tensor)r   r]   r^   s      r   compute_maskzAttention.compute_mask   s:    V$7<47?$$T!W--r   c                 p    |\  }}}||}g |d d |d   }| j                   r|d   |d   |d   f}||fS |S )Nr4   r   r.   )r   )r   r*   query_shapevalue_shape	key_shapeoutput_shapescores_shapes          r   compute_output_shapezAttention.compute_output_shape   se    .9+[)#I;Sb);;r?;(('NKNIaLIL--r   c                    | j                  ||       |d   }|d   }t        |      dkD  r|d   n|}| j                  |j                  |j                  |j                  g      }	t	        |	| j
                        }
| j                  s|rG|j                  d   |j                  d   |j                  d   f}t	        || j
                        }|
|fS |
S )Nr   r.   r-   rB   )r(   r_   rq   r#   r   compute_dtyper   )r   r]   r^   r`   rH   rS   r:   rF   r;   ro   output_specrp   attention_scores_specs                r   compute_output_speczAttention.compute_output_spec  s     	fd+q	q	v;?fQi 00[[%++syy1
 ",d6H6HI ((,CAA		!L
 %0D$6$6%!  !677r   c                    | j                   j                  }t        |t              st	        | d| d      t        |      dk  st        |      dkD  rt	        | dt        |       d      |Rt        |t              st	        | d| d      t        |      dk  st        |      dkD  rt	        | d| d	| d      yy)
z'Validates arguments of the call method.zj layer must be called on a list of inputs, namely [query, value] or [query, value, key]. Received: inputs=.r-      zl layer accepts inputs list of length 2 or 3, namely [query, value] or [query, value, key]. Received length: NzL layer mask must be a list, namely [query_mask, value_mask]. Received: mask=z< layer accepts mask list of length 2 or 3. Received: inputs=z, mask=)r   __name__
isinstancelistr   r_   )r   r]   r^   
class_names       r   r(   zAttention._validate_inputs'  s    ^^,,
&$', $$*81. 
 v;?c&kAo, $$'K=3 
 dD) !l #GGKfAO  4y1}D	A !l #((.xwtfA?  !. r   c                 t    t         |          }| j                  | j                  | j                  d}i ||S )N)r   r   r   )r   
get_configr   r   r   )r   base_configconfigr   s      r   r   zAttention.get_configB  s>    g(*//||

 )+(((r   )Fr   g        N)NF)NFFF)N)NFNF)rz   
__module____qualname____doc__r   r+   r?   rL   rZ   rg   rj   rq   rv   r(   r   __classcell__)r   s   @r   r	   r	      sr    6t .0*B,3\4  %$>.	  % D6) )r   r	   N)
	keras.srcr   r   keras.src.api_exportr   keras.src.backendr   keras.src.layers.layerr   r	   r   r   r   <module>r      s8      - ) ( &'@) @) (@)r   