
    Vh/                         d Z ddlmZmZ ddlZddlmZmZmZm	Z	m
Z
 dgZdeeeej                  f   deeeej                  f   fdZ G d	 d      Zy)
zu
This module implements Paged Attention on top of flex_attention.
This module is experimental and subject to change.
    )OptionalUnionN)	_identity_mask_mod_signature_score_mod_signature	BlockMask	noop_maskPagedAttentionxmultiplec                     | |z   dz
  |z  S )N    )r   r   s     `/home/dcms/DCMS/lib/python3.12/site-packages/torch/nn/attention/experimental/_paged_attention.py_cdivr      s     L1))    c                      e Zd ZdZ	 ddedededefdZdej                  dej                  d	d
fdZ	dej                  d	d
fdZ
dej                  dej                  dej                  dej                  dej                  dej                  d	d
fdZ	 ddedeej                     d	efdZdee   d	efdZdee   d	efdZy
)r
   aa  
    PagedAttention supports flex attention inference with a large batch size.
    With PagedAttention, a batch of key/value tensors with varying kv length
    is split into tensor blocks of fixed length and cached in a compact way.
    Thus we can avoid redundant memory consumption due to varying kv length and
    support a larger batch size.
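
    Example (an illustrative sketch of the intended call sequence; the sizes and
    device below, and the ``input_pos``/``k_val``/``v_val``/``k_cache``/``v_cache``/
    ``block_mask`` tensors, are assumptions prepared as described in the methods
    below, not defaults)::

        paged_attention = PagedAttention(
            n_pages=64, page_size=128, max_batch_size=4, device="cuda"
        )
        batch_idx = torch.tensor([0], device="cuda")
        # grow batch 0 to at least 300 slots, write new KV entries, then
        # translate a logical BlockMask into physical page indices
        paged_attention.reserve(batch_idx, torch.tensor([300], device="cuda"))
        paged_attention.assign(batch_idx, input_pos, k_val, v_val, k_cache, v_cache)
        physical_block_mask = paged_attention.convert_logical_block_mask(block_mask)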
    """

    def __init__(
        self,
        n_pages: int,
        page_size: int,
        max_batch_size: int,
        device: str = "cuda",
    ):
        # number of pages
        self.n_pages = n_pages

        # number of tokens per page
        self.page_size = page_size

        # page table: [batch_idx, logical_page_idx] -> physical_page_idx
        self.page_table = -torch.ones(
            (max_batch_size, self.n_pages), dtype=torch.int64, device=device
        )

        # capacity: batch_idx -> allocated sequence length
        self.capacity = torch.zeros(max_batch_size, dtype=torch.int64, device=device)

        # indices of empty pages that are available for allocation
        self.empty_pages = list(range(n_pages - 1, -1, -1))

        # mapping from physical page index to logical page index
        self.physical_to_logical = -torch.ones(
            (max_batch_size, n_pages), dtype=torch.int64, device=device
        )

    def reserve(self, batch_idx: torch.Tensor, seq_len: torch.Tensor) -> None:
        """
        Requests the capacity of a given batch to be at least enough to
        hold `seq_len` elements.

        Args:
            batch_idx (Tensor): batch index to be reserved; shape :math:`(1)`.
            seq_len (Tensor): minimum capacity for the given batch; shape :math:`(1)`.
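
        Example (illustrative): with ``page_size=128``, growing a fresh batch to
        ``seq_len=300`` allocates ``_cdiv(300, 128) == 3`` pages, raising that
        batch's capacity to ``3 * 128 == 384`` slots.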
        """
        if seq_len <= self.capacity[batch_idx]:
            return

        num_pages_to_allocate = _cdiv(
            seq_len - self.capacity[batch_idx], self.page_size
        )

        assert len(self.empty_pages) >= num_pages_to_allocate, (
            f"requested {num_pages_to_allocate.item()} pages "
            f"but there are only {len(self.empty_pages)} empty pages"
        )

        start_page_idx = self.capacity[batch_idx] // self.page_size
        end_page_idx = start_page_idx + num_pages_to_allocate

        # find empty physical pages
        allocated_pages = torch.tensor(
            self.empty_pages[-num_pages_to_allocate:],
            device=num_pages_to_allocate.device,
        )
        self.empty_pages = self.empty_pages[:-num_pages_to_allocate]

        # update page table
        self.page_table[
            batch_idx,
            start_page_idx:end_page_idx,
        ] = allocated_pages

        # update physical-to-logical mapping
        self.physical_to_logical[batch_idx, allocated_pages] = torch.arange(
            start_page_idx.item(),
            end_page_idx.item(),
            device=num_pages_to_allocate.device,
        )

        # update capacity
        self.capacity[batch_idx] += num_pages_to_allocate * self.page_size

    def erase(self, batch_idx: torch.Tensor) -> None:
        """
        Removes a single batch from paged attention.

        Args:
            batch_idx (Tensor): batch index to be removed; shape :math:`(1)`.
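
        Example (illustrative)::

            paged_attention.erase(torch.tensor([0]))
            # all pages previously held by batch 0 return to ``empty_pages``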
        """
        # find the pages currently owned by this batch
        allocated_page_idx = self.page_table[batch_idx] != -1
        allocated_pages = self.page_table[batch_idx][allocated_page_idx]

        # reset capacity and return the pages to the free pool
        self.capacity[batch_idx] = 0
        self.empty_pages += allocated_pages.tolist()

        # clear the physical-to-logical mapping and the page table entries
        self.physical_to_logical[batch_idx][:, allocated_pages] = -1
        self.page_table[batch_idx] = -1

    def assign(
        self,
        batch_idx: torch.Tensor,
        input_pos: torch.Tensor,
        k_val: torch.Tensor,
        v_val: torch.Tensor,
        k_cache: torch.Tensor,
        v_cache: torch.Tensor,
    ) -> None:
        """
        Assigns new contents `val` to the storage `cache` at the location
        `batch_idx` and `input_pos`.

        Args:
            batch_idx (Tensor): batch index; shape :math:`(B)`.
            input_pos (Tensor): input positions to be assigned for the given batch; shape :math:`(B, S)`.
            val (Tensor): value to be assigned; shape :math:`(B, H, S, D)`
            cache (Tensor): the cache to store the values; shape:`(1, H, MAX_S, D)`
        """
        if k_val.requires_grad:
            raise RuntimeError("val must not require gradient")

        B, H, S, K_D = k_val.shape
        V_D = v_val.shape[3]
        if B != batch_idx.shape[0]:
            raise RuntimeError(
                f"Expect val and batch_idx to have the same batch size "
                f"but got B={B} and B={batch_idx.shape[0]}."
            )
        if H != k_cache.shape[1]:
            raise RuntimeError(
                f"Expect val and cache to have the same number of heads "
                f"but got H={H} and H={k_cache.shape[1]}."
            )
        if S != input_pos.shape[1]:
            raise RuntimeError(
                f"Expect val and input_pos to have the same length "
                f"but got S={S} and S={input_pos.shape[1]}."
            )
        if K_D != k_cache.shape[3]:
            raise RuntimeError(
                f"Expect k_val and k_cache to have the same hidden dim "
                f"but got D={K_D} and D={k_cache.shape[3]}."
            )
        if V_D != v_cache.shape[3]:
            raise RuntimeError(
                f"Expect v_val and v_cache to have the same hidden dim "
                f"but got D={V_D} and D={v_cache.shape[3]}."
            )

        # translate each (batch, position) into a flat address in the paged cache
        logical_block_idx = input_pos // self.page_size  # [B, S]
        logical_block_offset = input_pos % self.page_size  # [B, S]
        physical_block_idx = torch.gather(
            self.page_table[batch_idx], 1, logical_block_idx.to(torch.int64)
        ).to(torch.int32)  # [B, S]

        addr = (physical_block_idx * self.page_size + logical_block_offset).view(
            -1
        )  # [B*S]

        k_val = k_val.permute(1, 0, 2, 3).contiguous().view(1, H, B * S, K_D)
        v_val = v_val.permute(1, 0, 2, 3).contiguous().view(1, H, B * S, V_D)

        k_cache[:, :, addr, :] = k_val
        v_cache[:, :, addr, :] = v_val

    def convert_logical_block_mask(
        self,
        block_mask: BlockMask,
        batch_idx: Optional[torch.Tensor] = None,
    ) -> BlockMask:
        """
        Converts a logical block mask by mapping its logical kv indices to the corresponding
        physical kv indices.

        Args:
            block_mask (BlockMask): logical block mask;
                kv_indices shape :math:`(B, H, ROWS, MAX_BLOCKS_IN_COL)`.
            batch_idx (Tensor): batch index corresponding to the block_mask
                batch dimension. This provides flexibility to convert a
                block mask with smaller batch size than the page table;
                shape :math:`(B)`.
        """
        B, H, ROWS, MAX_BLOCKS_IN_COL = block_mask.kv_indices.shape

        if block_mask.BLOCK_SIZE[1] != self.page_size:
            raise RuntimeError(
                f"Expect block_mask to have the same column block size as page_size "
                f"but got size={block_mask.BLOCK_SIZE[1]} and size={self.page_size}"
            )

        device = block_mask.kv_num_blocks.device

        if batch_idx is None:
            batch_idx = torch.arange(B, device=device)
        page_table = self.page_table[batch_idx]

        new_kv_num_blocks = block_mask.kv_num_blocks.clone()

        # widen the column dimension to n_pages since the converted (physical)
        # indices may be larger than the logical ones
        new_kv_indices = torch.zeros(
            (B, H, ROWS, self.n_pages), dtype=torch.int32, device=device
        )
        new_kv_indices[:, :, :, :MAX_BLOCKS_IN_COL] = (
            torch.gather(
                page_table, 1, block_mask.kv_indices.view(B, -1).to(torch.int64)
            )
            .view(block_mask.kv_indices.shape)
            .to(torch.int32)
        )

        new_full_kv_indices, new_full_kv_num_blocks = None, None
        if block_mask.full_kv_num_blocks is not None:
            assert block_mask.full_kv_indices is not None
            new_full_kv_num_blocks = block_mask.full_kv_num_blocks.clone()
            new_full_kv_indices = torch.zeros(
                (B, H, ROWS, self.n_pages), dtype=torch.int32, device=device
            )
            new_full_kv_indices[:, :, :, :MAX_BLOCKS_IN_COL] = (
                torch.gather(
                    page_table,
                    1,
                    block_mask.full_kv_indices.view(B, -1).to(torch.int64),
                )
                .view(block_mask.full_kv_indices.shape)
                .to(torch.int32)
            )

        new_mask_mod = self.get_mask_mod(block_mask.mask_mod)

        seq_lengths = (block_mask.seq_lengths[0], self.n_pages * self.page_size)
        return BlockMask.from_kv_blocks(
            new_kv_num_blocks,
            new_kv_indices,
            new_full_kv_num_blocks,
            new_full_kv_indices,
            block_mask.BLOCK_SIZE,
            new_mask_mod,
            seq_lengths=seq_lengths,
        )

    def get_mask_mod(
        self, mask_mod: Optional[_mask_mod_signature]
    ) -> _mask_mod_signature:
        """
        Converts a mask_mod based on mapping from the physical block index to the logical
        block index.

        Args:
            mask_mod (_mask_mod_signature): mask_mod based on the logical block index.
        """
        if mask_mod is None:
            mask_mod = noop_mask

        def new_mask_mod(
            b: torch.Tensor,
            h: torch.Tensor,
            q_idx: torch.Tensor,
            physical_kv_idx: torch.Tensor,
        ):
            physical_kv_block = physical_kv_idx // self.page_size
            physical_kv_offset = physical_kv_idx % self.page_size
            logical_block_idx = self.physical_to_logical[b, physical_kv_block]
            logical_kv_idx = logical_block_idx * self.page_size + physical_kv_offset
            # positions on unallocated pages (logical index -1) are masked out
            return torch.where(
                logical_block_idx >= 0, mask_mod(b, h, q_idx, logical_kv_idx), False
            )

        return new_mask_mod

    def get_score_mod(
        self, score_mod: Optional[_score_mod_signature]
    ) -> _score_mod_signature:
        """
        Converts a score_mod based on mapping from the physical block index to the logical
        block index.

        Args:
            score_mod (_score_mod_signature): score_mod based on the logical block index.
        """
        if score_mod is None:
            score_mod = _identity

        def new_score_mod(
            score: torch.Tensor,
            b: torch.Tensor,
            h: torch.Tensor,
            q_idx: torch.Tensor,
            physical_kv_idx: torch.Tensor,
        ):
            physical_kv_block = physical_kv_idx // self.page_size
            physical_kv_offset = physical_kv_idx % self.page_size
            logical_block_idx = self.physical_to_logical[b, physical_kv_block]
            logical_kv_idx = logical_block_idx * self.page_size + physical_kv_offset
            # scores for positions on unallocated pages are sent to -inf
            return torch.where(
                logical_block_idx >= 0,
                score_mod(score, b, h, q_idx, logical_kv_idx),
                float("-inf"),
            )

        return new_score_mod