
    nVh2                      d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZmZmZmZmZmZmZ d dlZdd	lmZ d dlZd dlZdd
lmZ ddlmZ ddlmZmZ  ed      ZdZ ejB                  Z"ddZ#ddZ$d Z%ddZ&e#dd       Z' G d d      Z( G d d      Z) e)d       Z*d Z+d Z, G d d      Z- G d d      Z. G d d e.      Z/e/Z0 G d! d"e/      Z1 G d# d$e1      Z2 G d% d&e/      Z3 G d' d(e.      Z4 G d) d*e/      Z5 e/d+      Z6 e/d,      Z7 e/d-      Z8 e/d.      Z9 e/d/      Z: e/d0      Z; e/d1      Z< e/d2      Z= e/d3      Z> e/d4      Z? e/d5      Z@ e/d6      ZA e/d7      ZB e/d8      ZC e/d9      ZD e/d:      ZE e/d;      ZF e/d<      ZG e/d=      ZH e1e:      ZIdd>ZJ G d? d@e-      ZK G dA dBe-      ZL G dC dD      ZM G dE dFe.      ZN G dG dHe-      ZO G dI dJeN      ZP G dK dLeO      ZQdM ZRdN ZSe#ddO       ZTe#ddP       ZUe#ddQ       ZVdRe dSeV_W        dT ZXdU ZYe#ddV       ZZe#ddW       Z[e$e#ddXdY              Z\e$e#ddXddZ              Z]e$e#ddXd[              Z^e#dd]       Z_e#dd^       Z`ed_        Zae$e#ddd`              Zbe$e#ddXda              Zce$e#d\ddbdc              Zddd Zee$e#dde              Zfe$e#dddf              Zge#ddddeGdfdg       Zhe#dd\eGdfdh       Zie#	 	 ddi       Zje#	 d	 ddj       Zke#ddk       Zle#ddl       Zme$e#ddm              Zne#dddn       Zoe$e#ddo              Zpe#	 d	 	 	 	 	 	 	 	 	 ddp       ZqdddqZre$e# erdrdst      ddu                     Zse$e# erdv      ddw                     Zte$e# erdx      ddy                     Zue$e# erdz      dd{                     Zve$e# erd|      dd}                     Zwe$e# erd~      dd                     Zxe$e# erd      dd                     Zye$e# erd      dd                     Zze#dd       Z{e#ddd       Z|e#ddd       Z}e#ddd       Z~e#e"j                  dfdd       Ze#e"j                  dfdd       Ze#e"j                  dfdd       Z	 	 d	 	 	 ddZed        Ze$e#dd              Ze#dd       Ze#dd       ZddZe$e#dd              Ze$e#dd              Ze$e#dd              Ze#dd       Ze#dd       Ze#dd       Ze#dd       Ze#dd       Ze#dddd\dddd       Ze#dd       Ze#d\ddd       Ze#dd       Ze#	 d	 	 	 dd       Z G d d      Z G d d      Z	 d	 ddZe#	 ddd       Zd Zd Zy)    )annotations)warn)contextmanager)Enum)partialwrapsN)UnionCallableListSequenceTypeVarOptionalTuple   )jit)ir   )semantic)TRITON_MAX_TENSOR_NUMELvalidate_block_shapeT__triton_builtin__c                j     t               sJ t                fd       }t        |t        d       |S )zMark a function as a builtin.c                 <    d|vs|d   t        d       | i |S )N_builderzdDid you forget to add @triton.jit ? (`_builder` argument must be provided outside of JIT functions.)
ValueErrorargskwargsfns     D/home/dcms/DCMS/lib/python3.12/site-packages/triton/language/core.pywrapperzbuiltin.<locals>.wrapper   s:    V#vj'9'A ` a a4"6""    T)callabler   setattrTRITON_BUILTIN)r!   r#   s   ` r"   builtinr(      s8    B<<
2Y# # G^T*Nr$   c                    t               sJ t        j                         }t        |j                  j                         ddhz
        dkD  } j                  sd _         xj                  d j                   d|rdnd d j                   d	|rd
nd d	z  c_         fd}t        |j                  j                               }|d   j                  d      |d<   |j                  |      }||_        d j                   d|_        t               rt        |t        d       t        t         j                  |        S )a  Decorator that adds this free function as a member fn on class tensor.

    When called as a member function on class tensor, the first argument to `fn`
    is `self`, i.e. the tensor object.

    If there are multiple decorators on a function, you probably want this one
    to be the highest one (i.e. furthest from the function's `def`), so it's
    applied last.

    Unfortunately you still need to add a type stub to the body of class tensor
    in order for pytype to know about it.
    r   
_generatorr    zb
    This function can also be called as a member function on :py:class:`tensor`,
    as :code:`x.(z...z)` instead of
    :code:`z(xz, ...z)`.
    c                      | i |S N r   s     r"   r#   z"_tensor_member_fn.<locals>.wrapperC   s    4"6""r$   r   selfname)
parameterszForwards to :py:func:`z` free functionT)r%   inspect	signaturelenr3   keys__doc____name__listvaluesreplace__signature__
is_builtinr&   r'   tensor)r!   orig_sighas_argsr#   
new_paramsnew_sigs   `     r"   _tensor_member_fnrD   )   s3    B<<  $H8&&++-\0JJKaOH::
JJ Qub9 :KK=h7B7 8 J#
 h))0023JqM))v)6JqM*5G#G.r{{m?KGO"~.FBKK)Ir$   c                j    t        |       dk(  r	 t        | d          | d   S | S # t        $ r Y | S w xY w)z7Returns x[0] if x has one element and x[0] is iterable.r   r   )r6   iter	TypeErrorxs    r"   _unwrap_iterablerJ   U   sF    
1v{	1JQ4K H  	H	s   % 	22c                $    t        | t        d      S )z-Is this a registered triton builtin function?F)getattrr'   r!   s    r"   r>   r>   k   s    2~u--r$   c                .    t        j                  | |      S r.   )r   	to_tensor)rI   r   s     r"   rO   rO   p   s    a**r$   c                      e Zd ZdZy)consta~  
    This class is used as a type annotation to mark pointers to constant data.
    The `store` function cannot be called with a pointer to const. Constness
    is part of the pointer type and the usual Triton type consistency rules
    apply. For example you cannot have a function that returns constant pointer
    in one return statement and non-constant pointer in another.
    N)r9   
__module____qualname__r8   r/   r$   r"   rQ   rQ   z   s     	r$   rQ   c                     e Zd ZdZd Zd+dZd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z d Z!d  Z"d! Z#d" Z$d# Z%d$ Z&d% Z'd& Z(d' Z)d( Z*d) Z+y*),	constexprzL
    This class is used to store a value that is known at compile-time.
    c                    t        |t              r|j                  | _        t        | _        y || _        t        | _        y r.   )
isinstancerU   valuetype)r0   rX   s     r"   __init__zconstexpr.__init__   s0    eY'DJ 	 DJ	r$   c                "    d| j                    dS )Nz
constexpr[]rX   r0   s    r"   __repr__zconstexpr.__repr__   s    DJJ<q))r$   c                    | j                   S r.   r]   r^   s    r"   	__index__zconstexpr.__index__       zzr$   c                D    t        | j                  t        |      z         S r.   rU   rX   _constexpr_to_valuer0   others     r"   __add__zconstexpr.__add__       &9%&@@AAr$   c                D    t        t        |      | j                  z         S r.   rU   re   rX   rf   s     r"   __radd__zconstexpr.__radd__       ,U3djj@AAr$   c                D    t        | j                  t        |      z
        S r.   rd   rf   s     r"   __sub__zconstexpr.__sub__   ri   r$   c                D    t        t        |      | j                  z
        S r.   rk   rf   s     r"   __rsub__zconstexpr.__rsub__   rm   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __mul__zconstexpr.__mul__   ri   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __mod__zconstexpr.__mod__   ri   r$   c                D    t        t        |      | j                  z        S r.   rk   rf   s     r"   __rmul__zconstexpr.__rmul__   rm   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __truediv__zconstexpr.__truediv__   ri   r$   c                D    t        t        |      | j                  z        S r.   rk   rf   s     r"   __rtruediv__zconstexpr.__rtruediv__   rm   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __floordiv__zconstexpr.__floordiv__       ':5'AABBr$   c                D    t        t        |      | j                  z        S r.   rk   rf   s     r"   __rfloordiv__zconstexpr.__rfloordiv__       ,U3tzzABBr$   c                D    t        | j                  t        |      kD        S r.   rd   rf   s     r"   __gt__zconstexpr.__gt__   ri   r$   c                D    t        t        |      | j                  kD        S r.   rk   rf   s     r"   __rgt__zconstexpr.__rgt__   rm   r$   c                D    t        | j                  t        |      k\        S r.   rd   rf   s     r"   __ge__zconstexpr.__ge__   r~   r$   c                D    t        t        |      | j                  k\        S r.   rk   rf   s     r"   __rge__zconstexpr.__rge__   r   r$   c                D    t        | j                  t        |      k        S r.   rd   rf   s     r"   __lt__zconstexpr.__lt__   ri   r$   c                D    t        t        |      | j                  k        S r.   rk   rf   s     r"   __rlt__zconstexpr.__rlt__   rm   r$   c                D    t        | j                  t        |      k        S r.   rd   rf   s     r"   __le__zconstexpr.__le__   r~   r$   c                D    t        t        |      | j                  k        S r.   rk   rf   s     r"   __rle__zconstexpr.__rle__   r   r$   c                D    t        | j                  t        |      k(        S r.   rd   rf   s     r"   __eq__zconstexpr.__eq__   r~   r$   c                D    t        | j                  t        |      k7        S r.   rd   rf   s     r"   __ne__zconstexpr.__ne__   r~   r$   c                ,    t        | j                        S r.   )boolrX   r^   s    r"   __bool__zconstexpr.__bool__       DJJr$   c                .    t        | j                         S r.   rU   rX   r^   s    r"   __neg__zconstexpr.__neg__       $**%%r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __and__zconstexpr.__and__   ri   r$   c                F    t        | j                  xr t        |            S r.   rd   rf   s     r"   logical_andzconstexpr.logical_and   s    B(;E(BCCr$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __or__zconstexpr.__or__   ri   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __xor__zconstexpr.__xor__   ri   r$   c                F    t        | j                  xs t        |            S r.   rd   rf   s     r"   
logical_orzconstexpr.logical_or   s    A':5'ABBr$   c                .    t        | j                        S r.   r   r^   s    r"   __pos__zconstexpr.__pos__   r   r$   c                .    t        | j                         S r.   r   r^   s    r"   
__invert__zconstexpr.__invert__   r   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   __pow__zconstexpr.__pow__   s    %8%??@@r$   c                D    t        t        |      | j                  z        S r.   rk   rf   s     r"   __rpow__zconstexpr.__rpow__   s    ,U3TZZ?@@r$   c                D    t        | j                  t        |      z	        S r.   rd   rf   s     r"   
__rshift__zconstexpr.__rshift__   r~   r$   c                D    t        | j                  t        |      z        S r.   rd   rf   s     r"   
__lshift__zconstexpr.__lshift__   r~   r$   c                .    t        | j                         S r.   r   r^   s    r"   __not__zconstexpr.__not__  s    TZZ((r$   c                ,    t        | j                        S r.   )rF   rX   r^   s    r"   __iter__zconstexpr.__iter__  r   r$   c                &     | j                   |i |S r.   r]   )r0   r   kwdss      r"   __call__zconstexpr.__call__  s    tzz4(4((r$   Nreturnstr),r9   rR   rS   r8   rZ   r_   ra   rh   rl   ro   rq   rs   ru   rw   ry   r{   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r$   r"   rU   rU      s    *BBBBBBBBBCCBBCCBBCCCC &BDBBC&&AACC) )r$   rU   c                >    t        | t              r| j                  S | S r.   rW   rU   rX   )os    r"   _unwrap_if_constexprr     s     I.1775A5r$   c           	         t        | t              rkt        |t              rZ| j                  j                  j
                  }|j                  |k\  r*t        d|j                   d| d| j                   d       y y y y )NzValue z exceeds the maximum bitwidth (z) for type 'z)'. This may result in undefined behavior.)	rW   r?   rU   rY   scalarprimitive_bitwidthrX   r   dtype)rX   shift_valuebitwidths      r"   check_bit_widthr     s    % ZY%G::$$77(**++J8*T`afalal`m  nW  X ) &H r$   c                  $    e Zd ZU dZded<   ddZy)
base_valuezLBase class of values that exist in the triton IR (i.e. not constexprs).
    	base_typerY   c                    t         )znFlatten frontend value into a sequence of mlir handles, which are appended
        to the output list
        NotImplementedErrorr0   handless     r"   _flatten_irzbase_value._flatten_ir   s
     "!r$   Nr   List[ir.value]r   None)r9   rR   rS   r8   __annotations__r   r/   r$   r"   r   r     s    
O"r$   r   c                       e Zd Zd Zd ZddZy)r   c                    t        d      )NzTypes must implement __eq__r   rf   s     r"   r   zbase_type.__eq__)  s    !"?@@r$   c                    | |k(   S r.   r/   rf   s     r"   r   zbase_type.__ne__,      EM""r$   c                    t         )a  Build a frontend value with the current dtype, wrapping a list of existing handles.
        cursor is the index of the first handle relevant to this value, and the function
        should return the updated cursor position after any handles consumed by the created value.
        r   r0   r   cursors      r"   _unflatten_irzbase_type._unflatten_ir/  s
    
 "!r$   Nr   r   r   intr   zTuple[base_value, int])r9   rR   rS   r   r   r   r/   r$   r"   r   r   '  s    A#"r$   r   c                     e Zd Zg dZg dZg dZg dZdgZ G d de      Z	 G d d	e      Z
d
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z d  Z!d! Z"d" Z#d# Z$d$ Z%d% Z&d& Z'e(d'        Z)e(d(        Z*e(d)        Z+e(d*        Z,e(d+        Z-e(d,        Z.d7d-Z/d. Z0e1d/        Z2d8d0Z3d1 Z4d2 Z5e1d9d3       Z6d4 Z7d:d5Z8y6);r   )int8int16int32int64)int1uint8uint16uint32uint64)	fp8e4b15fp8e4nvfp8e4b8fp8e5fp8e5b16fp16bf16fp32fp64)r   r   r   r   voidc                      e Zd ZdZdZy)dtype.SIGNEDNESSr   r   N)r9   rR   rS   SIGNEDUNSIGNEDr/   r$   r"   
SIGNEDNESSr   C  s    r$   r   c                      e Zd ZdZdZdZy)
dtype.KINDr   r   r   N)r9   rR   rS   BOOLEANINTEGRALFLOATINGr/   r$   r"   KINDr   G  s    r$   r   c                   t        |      }|| _        |t        j                  t        j                  z   t        j
                  z   t        j                  z   v sJ |       |t        j                  v rSt        j                  j                  | _	        t        |j                  d      d         | _        | j                  | _        y |t        j                  v rSt        j                  j                  | _	        t        |j                  d      d         | _        | j                  | _        y |t        j
                  v r|dk(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y |d	k(  rd| _        d| _        d| _        y |d
k(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y |dk(  rd| _        d| _        d| _        y t#        d|       |dk(  rd| _        y y )Nr   r            r      r   r   r   r      r   
   r      r          r   4   @   i  z Unsupported floating-point type r   r   )r   r2   r   
SINT_TYPES
UINT_TYPESFP_TYPESOTHER_TYPESr   r   int_signednessr   splitint_bitwidthr   r   fp_mantissa_widthexponent_biasRuntimeErrorr0   r2   s     r"   rZ   zdtype.__init__L  s0   #D)	u''%*:*::U^^KeN_N___eaee_5###"'"2"2"9"9D #DJJu$5b$9 :D&*&7&7D#U%%%"'"2"2";";D #DJJu$5b$9 :D&*&7&7D#U^^#z!)*&*+'%'"")*&*+'%&"")*&*+'%&")*&*+'%'"#)*&*+'%'")+&*,'%'")*&*,'%(")+&*,'%(")+&*,'%)""%EdV#LMMV^&'D# r$   c                    d| j                   v S )Nfp8r1   r^   s    r"   is_fp8zdtype.is_fp8  s    		!!r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   
is_fp8e4nvzdtype.is_fp8e4nv      yyI%%r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   
is_fp8e4b8zdtype.is_fp8e4b8  r  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp8e4b15zdtype.is_fp8e4b15      yyJ&&r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp8e5zdtype.is_fp8e5      yyG##r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp8e5b16zdtype.is_fp8e5b16  r!  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp16zdtype.is_fp16      yyF""r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_bf16zdtype.is_bf16  r)  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp32zdtype.is_fp32  r)  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_fp64zdtype.is_fp64  r)  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_int1zdtype.is_int1  r)  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_int8zdtype.is_int8  r)  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_int16zdtype.is_int16  r$  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_int32zdtype.is_int32  r$  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_int64zdtype.is_int64  r$  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   is_uint8zdtype.is_uint8  r$  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   	is_uint16zdtype.is_uint16      yyH$$r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   	is_uint32zdtype.is_uint32  r>  r$   c                     | j                   dk(  S )Nr   r1   r^   s    r"   	is_uint64zdtype.is_uint64  r>  r$   c                :    | j                   t        j                  v S r.   )r2   r   r  r^   s    r"   is_floatingzdtype.is_floating  s    yyENN**r$   c                :    | j                   t        j                  v S r.   )r2   r   STANDARD_FP_TYPESr^   s    r"   is_standard_floatingzdtype.is_standard_floating  s    yyE3333r$   c                :    | j                   t        j                  v S r.   )r2   r   r  r^   s    r"   is_int_signedzdtype.is_int_signed      yyE,,,,r$   c                :    | j                   t        j                  v S r.   )r2   r   r  r^   s    r"   is_int_unsignedzdtype.is_int_unsigned  rJ  r$   c                \    | j                   t        j                  t        j                  z   v S r.   )r2   r   r  r  r^   s    r"   is_intzdtype.is_int  s"    yyE,,u/?/????r$   c                "    | j                         S r.   )r1  r^   s    r"   is_boolzdtype.is_bool      ||~r$   c                   | j                         rt        j                  j                  S | j	                         rt        j                  j
                  S | j                         sJ t        j                  j                  S r.   )rP  r   r   r   rN  r   rD  r   r^   s    r"   kindz
dtype.kind  sV    <<>::%%%[[]::&&&##%%%::&&&r$   c                    | j                         rd| j                  dz
  z  dz
  S | j                         rd| j                  z  dz
  S J )Nr   r   rI  r  rL  r^   s    r"   get_int_max_valuezdtype.get_int_max_value  sP    t((1,-11!d'''!++ur$   c                n    | j                         rd| j                  dz
  z   S | j                         ryJ )Nr   r   r   rU  r^   s    r"   get_int_min_valuezdtype.get_int_min_value  s;    ))A-...!ur$   c                    | t         j                  t         j                  z   t         j                  z   t         j                  z   v S r.   )r   r  r  r  r  )type_strs    r"   is_dtypezdtype.is_dtype  s3    5++e.>.>>ORWRcRccccr$   c                     t        d      )NzNot implementedr  r/   r$   r"   is_voidzdtype.is_void  s    ,--r$   c                      yNFr/   r/   r$   r"   is_blockzdtype.is_block      r$   c                      yr`  r/   r/   r$   r"   is_ptrzdtype.is_ptr  rb  r$   c                      yr`  r/   r/   r$   r"   is_constzdtype.is_const  rb  r$   c                      yr`  r/   r/   r$   r"   is_tuplezdtype.is_tuple  rb  r$   c                V    t        |t              sy| j                  |j                  k(  S r`  )rW   r   r2   rf   s     r"   r   zdtype.__eq__  s"    %'yyEJJ&&r$   c                .    t        | j                  f      S r.   )hashr2   r^   s    r"   __hash__zdtype.__hash__  s    TYYM""r$   c                    | S r.   r/   r^   s    r"   r   zdtype.scalar      r$   c                   | j                   j                  d      r| j                   |j                  j                  vr%t	        d|  d|j                  j                         | j                   |j                  j
                  v rt        | j                    d       | j                   dk(  r|j                         S | j                   dk(  r|j                         S | j                   dv r|j                         S | j                   dv r|j                         S | j                   d	v r|j                         S | j                   d
v r|j                         S | j                   dk(  r|j                         S | j                   dk(  r|j                         S | j                   dk(  r|j                         S | j                   dk(  r|j!                         S | j                   dk(  r|j#                         S | j                   dk(  r|j%                         S | j                   dk(  r|j'                         S | j                   dk(  r|j)                         S | j                   dk(  r|j+                         S t	        d|  d      )Nr  ztype zB not supported in this architecture. The supported fp8 dtypes are zR is deprecated in this architecture and will be removed in a future triton releaser   r   )r   r   )r   r   )r   r   )r   r   r   r   r   r   r   r   r   r   r   zfail to convert z to ir type)r2   
startswithoptionssupported_fp8_dtypesr   deprecated_fp8_dtypesr   get_void_tyget_int1_tyget_int8_tyget_int16_tyget_int32_tyget_int64_tyget_fp8e5_tyget_fp8e5b16_tyget_fp8e4nv_tyget_fp8e4b8_tyget_fp8e4b15_tyget_half_tyget_bf16_tyget_float_tyget_double_tyr0   builders     r"   to_irzdtype.to_ir	  s3   99&yy D DD 5 /AAHAeAe@f"h i iyyGOOAAA		{"tuv99&&((YY& &&((YY++&&((YY--''))YY--''))YY--''))YY'!''))YY*$**,,YY)#))++YY)#))++YY*$**,,YY& &&((YY& &&((YY& ''))YY& ((**+D6=>>r$   c                    | j                   S r.   r1   r^   s    r"   __str__zdtype.__str__1      yyr$   c                    | j                   j                  d      rd| j                   dd  z   S | j                   j                  d      rd| j                   dd  z   S | j                   S )Nfpfloatr   bfbfloat)r2   rp  r^   s    r"   codegen_namezdtype.codegen_name4  sZ    99%TYYqr]**YY!!$'diim++99r$   c                    | j                   S )z"See cache_key_part() in triton.cc.r1   r^   s    r"   cache_key_partzdtype.cache_key_part<  s     yyr$   c                (    d| j                          S )z4Output of repr needs to be an evaluatable expressionztriton.language.)r  r^   s    r"   r_   zdtype.__repr__A  s    !$"3"3"5!677r$   c                *    t        ||   |       |dz   fS Nr   )r?   r   s      r"   r   zdtype._unflatten_irE  s    gfot,fqj88r$   N)rg   r   )r  
ir.builderr   zir.typer   r   )9r9   rR   rS   r  r  r  rF  r  r   r   r   rZ   r  r  r  r   r#  r&  r(  r+  r-  r/  r1  r3  r5  r7  r9  r;  r=  r@  rB  rD  rG  rI  rL  rN  rP  rS  rV  rX  staticmethodr[  r^  ra  rd  rf  rh  r   rl  propertyr   r  r  r  r  r_   r   r/   r$   r"   r   r   <  s   4J@JfH8(KT t 
4(l"&&'$'######$$$$%%%+4--@' d d . .        '
#  &?P  89r$   r   c                  N    e Zd Zd
ddZddZd Zd Zd Zd ZddZ	e
d        Zy	)pointer_typec                    t        |      }t        |t              s"t        dt	        |      j
                   d      || _        || _        || _        |sd| d| _	        y d| d| _	        y )Nzelement_ty has type `z`; expected `dtype`.zpointer<>zconst_pointer<)
r   rW   r   rG   rY   r9   
element_tyaddress_spacerQ   r2   )r0   r  r  rQ   s       r"   rZ   zpointer_type.__init__Q  sq    )*5
*e,3D4D4M4M3NNbcdd$*
49hzl!,	PZ|[\?]	r$   c                l    |j                  | j                  j                  |      | j                        S r.   )
get_ptr_tyr  r  r  r  s     r"   r  zpointer_type.to_irZ  s*    !!$//"7"7"@$BTBTUUr$   c                    | j                   S r.   r1   r^   s    r"   r  zpointer_type.__str__]  r  r$   c                "    | j                         S r.   r  r^   s    r"   r_   zpointer_type.__repr__`  rQ  r$   c                     yNTr/   r^   s    r"   rd  zpointer_type.is_ptrc      r$   c                    | j                   S r.   )rQ   r^   s    r"   rf  zpointer_type.is_constf  rb   r$   c                    t        |t              sy| j                  |j                  k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S r`  )rW   r  r  r  rQ   rf   s     r"   r   zpointer_type.__eq__i  sR    %.%"2"22~t7I7IUM`M`7`~eieoeosxs~s~e~~r$   c                    | S r.   r/   r^   s    r"   r   zpointer_type.scalarn  rn  r$   N)r   F)r  r   r  r   rQ   r   )r  r  r   zir.pointer_type)rg   r  r   r   )r9   rR   rS   rZ   r  r  r_   rd  rf  r   r  r   r/   r$   r"   r  r  O  s<    ^V
  r$   r  c                        e Zd Zd fd	Z xZS )nv_tma_desc_typec                @    t         |   t        ||       d| _        y )N)rQ   r  r  )superrZ   r   r2   )r0   rQ   r  	__class__s      r"   rZ   znv_tma_desc_type.__init__u  s    e=I&	r$   )Tr   )r9   rR   rS   rZ   __classcell__r  s   @r"   r  r  s  s    ' 'r$   r  c                  N    e Zd Zd
dZddZd Zd Zd ZddZddZ	e
d        Zy	)
block_typec                   || _         t        |t        t        f      sJ t        t	        |            | _        | j
                  st        d      t        | j
                        | _        d| j
                   d| j                    d| _	        y )Nz0d block_type is forbidden<, r  )
r  rW   r:   tuple_unwrap_shapeshaperG   r   numelr2   )r0   r  r  s      r"   rZ   zblock_type.__init__|  st    $ 54-010 =/0
zz899)$**5


|2doo%6a8	r$   c                l    |j                  | j                  j                  |      | j                        S r.   )get_block_tyr  r  r  r  s     r"   r  zblock_type.to_ir  s(    ##DOO$9$9'$BDJJOOr$   c                    | j                   S r.   r1   r^   s    r"   r  zblock_type.__str__  r  r$   c                "    | j                         S r.   r  r^   s    r"   r_   zblock_type.__repr__  rQ  r$   c                     yr  r/   r^   s    r"   ra  zblock_type.is_block  r  r$   c                    | j                   S r.   r  r^   s    r"   get_block_shapeszblock_type.get_block_shapes  rb   r$   c                    t        |t              sy| j                  |j                  k(  xr | j                  |j                  k(  S r`  )rW   r  r  r  rf   s     r"   r   zblock_type.__eq__  s6    %,%"2"22PtzzU[[7PPr$   c                    | j                   S r.   )r  r^   s    r"   r   zblock_type.scalar  s    r$   N)r  r   r  r   )r  r  r   zir.block_type)r   z	List[int]r   r   )r9   rR   rS   rZ   r  r  r_   ra  r  r   r  r   r/   r$   r"   r  r  z  s;    9PQ
  r$   r  c                  D    e Zd Zd
dZd Zd ZddZddZd Zd Z	dd	Z
y)
tuple_typeNc           
         || _         |xs dgt        |      z  | _        ddj                  t	        | j                  | j                         D cg c]  \  }}| d|  c}}      z   dz   | _        y c c}}w )Nr+   [,:r\   )typesr6   fieldsjoinzipr2   )r0   r  r  kvs        r"   rZ   ztuple_type.__init__  sf    
1s5z 1#((3t{{DJJ;W#X41aqc1#J#XYY\__	#Xs   A3c                    | j                   S r.   r1   r^   s    r"   r  ztuple_type.__str__  r  r$   c                ,    t        | j                        S r.   )rF   r  r^   s    r"   r   ztuple_type.__iter__  r   r$   c                ^    | j                   D cg c]  }|j                  |       c}S c c}w r.   )r  r  )r0   r  tys      r"   r  ztuple_type.to_ir  s#    ,0JJ7b!777s   *c                     | j                   |   S r.   )r  )r0   indexs     r"   __getitem__ztuple_type.__getitem__  s    zz%  r$   c                     yr  r/   r^   s    r"   rh  ztuple_type.is_tuple  r  r$   c                    t        |       t        |      u xr4 | j                  |j                  k(  xr | j                  |j                  k(  S r.   )rY   r  r  rf   s     r"   r   ztuple_type.__eq__  s<    DzT%[(fTZZ5;;-Ff4;;Z_ZfZfKffr$   c                    g }| j                   D ](  }|j                  ||      \  }}|j                  |       * t        ||       |fS r.   )r  r   appendr  )r0   r   r   r;   r  rX   s         r"   r   ztuple_type._unflatten_ir  sP    ** 	!B,,Wf=ME6MM% 	! VT"F**r$   r.   r  r  )r  r   r   r   )r   r   r   r   r   zTuple[tuple, int])r9   rR   rS   rZ   r  r   r  r  rh  r   r   r/   r$   r"   r  r    s,    `
 8!g+r$   r  c                      e Zd Zd Zy)
slice_typec                    d| _         y )Nr  r1   r^   s    r"   rZ   zslice_type.__init__  s	     	r$   Nr9   rR   rS   rZ   r/   r$   r"   r  r    s    !r$   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                
   | dk(  rt         S | dk(  r|rt        S | dk(  r|st        S | dk(  r|rt        S | dk(  r|st        S | dk(  r|rt
        S | dk(  r|st        S | dk(  r|rt        S | dk(  r|st        S t        d|  d|       )Nr   r  r  r	  r  zUnsupported bitwidth z and signedness )
r   r   r   r   r   r   r   r   r   r   )r   signeds     r"   get_int_dtyper    s    1}	Q6	Qv	RF	R	RF	R	RF	R0
:J6(STTr$   c                      e Zd ZdZd[ fdZd\dZd]dZed^d       Zed^d       Z	ed^d       Z
ed^d	       Zed^d
       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Zed^d       Z ed^d       Z!ed^d        Z"ed^d!       Z#ed^d"       Z$ed^d#       Z%ed^d$       Z&ed^d%       Z'ed^d&       Z(ed^d'       Z)ed^d(       Z*ed^d)       Z+ed^d*       Z,ed^d+       Z-ed^d,       Z.ed^d-       Z/e0d.        Z1ed_d`d/       Z2dad0Z3dad1Z4dad2Z5dbd3Z6dad4Z7dad5Z8dad6Z9dcdad7Z:dddad8Z;dad9Z<dedad:Z=dfdad;Z>dfdad<Z?dfdad=Z@dfdad>ZAdfdad?ZBdfdad@ZCdfdadAZDdadBZEdadCZFdadDZGdadEZHdadFZIdadGZJdadHZKdgdadIZLdgdadJZMdadKZNdadLZOdadMZPdadNZQdgdadOZRdadPZSdhdadQZTdidadRZUdhdadSZVdidadTZWd_dadUZXdcdadVZYdjdadWZZdjdadXZ[de\fdkdYZ]d^dadZZ^ xZ_S )lr?   a  Represents an N-dimensional array of values or pointers.

    :code:`tensor` is the fundamental data structure in Triton programs.  Most
    functions in :py:mod:`triton.language` operate on and return tensors.

    Most of the named member functions here are duplicates of the free functions
    in :code:`triton.language`.  For example, :code:`triton.language.sqrt(x)` is
    equivalent to :code:`x.sqrt()`.

    :code:`tensor` also defines most of the magic/dunder methods, so you can
    write :code:`x+y`, :code:`x << 2`, etc.

    .. rubric:: Constructors
    ..
       For some reason Sphinx includes __init__ before printing the full table
       of methods.  Not what I want, but I can't figure out how to fix it.  Give
       it its own section so it looks intentional. :)
    c                   t         |           || _        |j                         r|j                  nd| _        d| _        | j                  D ]  }| xj
                  |z  c_         t        | j
                        | _        || _        |j                  | _	        t        | j                  D cg c]  }t        |       c}      | _        yc c}w )Not called by user code.r/   r   N)r  rZ   handlera  r  r  rU   rY   r   r   r  )r0   r  rY   sr  s       r"   rZ   ztensor.__init__  s    #'==?TZZ

 	AJJ!OJ	tzz*
	[[
$**=QIaL=>
=s   ,Cc                :    |j                  | j                         y r.   r  r  r   s     r"   r   ztensor._flatten_ir#      t{{#r$   c                ~    t        | j                        dz   dj                  d | j                  D              z   dz   S )Nr  r  c              3  2   K   | ]  }t        |        y wr.   )r   .0r  s     r"   	<genexpr>z!tensor.__str__.<locals>.<genexpr>(  s     0LAQ0Ls   r\   )r   r   r  r  r^   s    r"   r  ztensor.__str__&  s2    4::$tyy0L0L'LLsRRr$   Nc                     t        | |d|      S NT)sanitize_overflowr   addr0   rg   r   s      r"   rh   ztensor.__add__*      4$JJr$   c                     t        || d|      S r  r  r  s      r"   rl   ztensor.__radd__.      5$$JJr$   c                     t        | |d|      S r  subr  s      r"   ro   ztensor.__sub__2  r  r$   c                     t        || d|      S r  r  r  s      r"   rq   ztensor.__rsub__6  r  r$   c                     t        | |d|      S r  mulr  s      r"   rs   ztensor.__mul__:  r  r$   c                     t        || d|      S r  r  r  s      r"   rw   ztensor.__rmul__>  r  r$   c                F    t        |      }t        j                  | ||      S r.   r   r   truedivr  s      r"   ry   ztensor.__truediv__B  s!    $U+eX66r$   c                F    t        |      }t        j                  || |      S r.   r  r  s      r"   r{   ztensor.__rtruediv__G  s!    $U+tX66r$   c                F    t        |      }t        j                  | ||      S r.   r   r   floordivr  s      r"   r}   ztensor.__floordiv__L  s!    $U+  uh77r$   c                F    t        |      }t        j                  || |      S r.   r  r  s      r"   r   ztensor.__rfloordiv__Q  s!    $U+  h77r$   c                F    t        |      }t        j                  | ||      S r.   r   r   modr  s      r"   ru   ztensor.__mod__V      $U+||D%22r$   c                F    t        |      }t        j                  || |      S r.   r	  r  s      r"   __rmod__ztensor.__rmod__[      $U+||E422r$   c                .    t        j                  | |      S r.   )r   minusr0   r   s     r"   r   ztensor.__neg__a  s    ~~dH--r$   c                .    t        j                  | |      S r.   )r   invertr  s     r"   r   ztensor.__invert__e  s    tX..r$   c                F    t        |      }t        j                  | ||      S r.   r   r   and_r  s      r"   r   ztensor.__and__k      $U+}}T5(33r$   c                F    t        |      }t        j                  || |      S r.   r  r  s      r"   __rand__ztensor.__rand__p      $U+}}UD(33r$   c                F    t        |      }t        j                  | ||      S r.   r   r   or_r  s      r"   r   ztensor.__or__u  r  r$   c                F    t        |      }t        j                  || |      S r.   r  r  s      r"   __ror__ztensor.__ror__z  r  r$   c                F    t        |      }t        j                  | ||      S r.   r   r   xor_r  s      r"   r   ztensor.__xor__  r  r$   c                F    t        |      }t        j                  || |      S r.   r!  r  s      r"   __rxor__ztensor.__rxor__  r  r$   c                ^    t        | |       t        |      }t        j                  | ||      S r.   r   r   r   shlr  s      r"   r   ztensor.__lshift__  s)    e$$U+||D%22r$   c                ^    t        ||        t        |      }t        j                  || |      S r.   r&  r  s      r"   __rlshift__ztensor.__rlshift__  s)    t$$U+||E422r$   c                    t        | |       t        |      }| j                  j                         rt	        j
                  | ||      S t	        j                  | ||      S r.   r   r   r   rI  r   ashrlshrr  s      r"   r   ztensor.__rshift__  sL    e$$U+::##%==uh77==uh77r$   c                    t        ||        t        |      }| j                  j                         rt	        j
                  || |      S t	        j                  || |      S r.   r+  r  s      r"   __rrshift__ztensor.__rrshift__  sL    t$$U+::##%==h77==h77r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   greater_thanr  s      r"   r   ztensor.__gt__  s)    ""5(3$$T5(;;r$   c                \    t        j                  ||      }t        j                  || |      S r.   r1  r  s      r"   r   ztensor.__rgt__  s)    ""5(3$$UD(;;r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   greater_equalr  s      r"   r   ztensor.__ge__  s)    ""5(3%%dE8<<r$   c                \    t        j                  ||      }t        j                  || |      S r.   r5  r  s      r"   r   ztensor.__rge__  s)    ""5(3%%eT8<<r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   	less_thanr  s      r"   r   ztensor.__lt__  )    ""5(3!!$x88r$   c                \    t        j                  ||      }t        j                  || |      S r.   r9  r  s      r"   r   ztensor.__rlt__  )    ""5(3!!%x88r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   
less_equalr  s      r"   r   ztensor.__le__  )    ""5(3""499r$   c                \    t        j                  ||      }t        j                  || |      S r.   r?  r  s      r"   r   ztensor.__rle__  s)    ""5(3""5$99r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   equalr  s      r"   r   ztensor.__eq__  s'    ""5(3~~dE844r$   c                \    t        j                  ||      }t        j                  || |      S r.   rD  r  s      r"   __req__ztensor.__req__  s'    ""5(3~~eT844r$   c                \    t        j                  ||      }t        j                  | ||      S r.   r   rO   	not_equalr  s      r"   r   ztensor.__ne__  r;  r$   c                \    t        j                  ||      }t        j                  || |      S r.   rI  r  s      r"   __rne__ztensor.__rne__  r=  r$   c                \    t        j                  ||      }t        j                  | ||      S r.   )r   rO   r   r  s      r"   r   ztensor.logical_and  s)    ""5(3##D%::r$   c                \    t        j                  ||      }t        j                  | ||      S r.   )r   rO   r   r  s      r"   r   ztensor.logical_or  rA  r$   c                .    t        j                  | |      S r.   )r   not_r  s     r"   r   ztensor.__not__  s    }}T8,,r$   c                   dd l }t        ||j                  t        t        f      s||g}t        |t              r|j
                  }| }t        |      D ]  \  }}|t        |t              r$|j                  t        j                  |||      }<t        ||j                  t        f      r%|j                  |j                  |j                  }t        d|        |S )Nr   zunsupported tensor index: )builtinsrW   slicerU   r  r;   	enumeraterX   r   expand_dimsstartstopstepr   )r0   slicesr   rR  retdimsls          r"   r  ztensor.__getitem__  s    fx~~ui@AV^XFfe$]]F ( 	DGCzZI6288;K**3X>B 78RXX=MRTRYRYRafhfmfmfu #=bT!BCC	D 
r$   c                    J d       )zTransposes a 2D tensor.z0Transposition must be created by the AST Visitorr/   r^   s    r"   r   ztensor.T	  s     	IHHur$   c                "    t        | ||||      S )z3
        Alias for :py:func:`tensor.cast`.
        r   )cast)r0   r   fp_downcast_roundingbitcastr   s        r"   toz	tensor.to  s    
 D%!5wRRr$   c                     y r.   r/   r0   r  s     r"   broadcast_toztensor.broadcast_to      r$   c                     y r.   r/   r0   dimss     r"   transztensor.trans  rg  r$   c                     y r.   r/   ri  s     r"   permuteztensor.permute   rg  r$   c                     y r.   r/   r^   s    r"   r  ztensor.split#  rg  r$   c                     y r.   r/   re  s     r"   viewztensor.view&  rg  r$   c                     y r.   r/   re  s     r"   reshapeztensor.reshape)  rg  r$   c                     y r.   r/   )r0   axiss     r"   rU  ztensor.expand_dims,  rg  r$   c                     y r.   r/   )r0   r   ra  rb  s       r"   r`  ztensor.cast/  rg  r$   c                     y r.   r/   )r0   rX   maskboundary_checkcache_modifiereviction_policys         r"   storeztensor.store2  rg  r$   c                     y r.   r/   )r0   offsetss     r"   advanceztensor.advance5  rg  r$   c                     y r.   r/   )r0   cmpvalsemscopes        r"   
atomic_casztensor.atomic_cas8  rg  r$   c                     y r.   r/   r0   r  rw  r  r  s        r"   atomic_xchgztensor.atomic_xchg;  rg  r$   c                     y r.   r/   r  s        r"   
atomic_addztensor.atomic_add>  rg  r$   c                     y r.   r/   r  s        r"   
atomic_maxztensor.atomic_maxA  rg  r$   c                     y r.   r/   r  s        r"   
atomic_minztensor.atomic_minD  rg  r$   c                     y r.   r/   r  s        r"   
atomic_andztensor.atomic_andG  rg  r$   c                     y r.   r/   r  s        r"   	atomic_orztensor.atomic_orJ  rg  r$   c                     y r.   r/   r  s        r"   
atomic_xorztensor.atomic_xorM  rg  r$   c                     y r.   r/   r^   s    r"   expz
tensor.expP  rg  r$   c                     y r.   r/   r^   s    r"   logz
tensor.logS  rg  r$   c                     y r.   r/   r^   s    r"   cosz
tensor.cosV  rg  r$   c                     y r.   r/   r^   s    r"   sinz
tensor.sinY  rg  r$   c                     y r.   r/   r^   s    r"   sqrtztensor.sqrt\  rg  r$   c                     y r.   r/   r^   s    r"   rsqrtztensor.rsqrt_  rg  r$   c                     y r.   r/   r^   s    r"   absz
tensor.absb  rg  r$   c                     y r.   r/   )r0   rt  
combine_fn	keep_dimss       r"   reduceztensor.reducee  rg  r$   c                     y r.   r/   )r0   rt  r  reverses       r"   associative_scanztensor.associative_scanh  rg  r$   c                     y r.   r/   )r0   indicesrt  s      r"   gatherztensor.gatherk  rg  r$   c                     y r.   r/   )r0   num_binss     r"   	histogramztensor.histogramn  rg  r$   c                     y r.   r/   )r0   divs     r"   cdivztensor.cdivq  rg  r$   c                     y r.   r/   r^   s    r"   sigmoidztensor.sigmoidt  rg  r$   c                     y r.   r/   )r0   ieee_roundings     r"   softmaxztensor.softmaxw  rg  r$   c                     y r.   r/   r^   s    r"   ravelztensor.ravelz  rg  r$   c                     y r.   r/   r0   rt  return_indicesreturn_indices_tie_break_leftr  s        r"   maxz
tensor.max}  rg  r$   c                     y r.   r/   r0   rt  tie_break_leftr  s       r"   argmaxztensor.argmax  rg  r$   c                     y r.   r/   r  s        r"   minz
tensor.min  rg  r$   c                     y r.   r/   r  s       r"   argminztensor.argmin  rg  r$   c                     y r.   r/   )r0   rt  r  r   s       r"   sumz
tensor.sum  rg  r$   c                     y r.   r/   )r0   rt  r  s      r"   xor_sumztensor.xor_sum  rg  r$   c                     y r.   r/   r0   rt  r  s      r"   cumsumztensor.cumsum  rg  r$   c                     y r.   r/   r  s      r"   cumprodztensor.cumprod  rg  r$   c                     y r.   r/   )r0   r[  
descendings      r"   sortztensor.sort  rg  r$   c                     y r.   r/   )r0   r[  s     r"   flipztensor.flip  rg  r$   )rY   r   r   r   r.   NFNr   r   ra  zOptional[str]rb  r   r   r?   r   ztuple[tensor, tensor]r`  )Nr/   r+   r+   NNNNNF)NFTF)TF)r   F)r[  rU   r  rU   r   r?   )`r9   rR   rS   r8   rZ   r   r  r(   rh   rl   ro   rq   rs   rw   ry   r{   r}   r   ru   r  r   r   r   r  r   r  r   r$  r   r)  r   r/  r   r   r   r   r   r   r   r   r   rG  r   rL  r   r   r   r  r  r   rc  rf  rk  rm  r  rp  rr  rU  r`  r{  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  CONSTEXPR_0r  r  r  r  s   @r"   r?   r?     sS   &? $S K K K K K K K K K K K K 7 7 7 7 8 8 8 8 3 3 3 3
 . . / /
 4 4 4 4 3 3 3 3 4 4 4 4 3 3
 3 3
 8 8 8 8 < < < <
 = = = =
 9 9 9 9
 : : : :
 5 5 5 5 9 9 9 9 ; ; : : - -    I I S S %)+  r$   r?   c                  ^    e Zd ZdddZddZd ZddZd Zd Zd Z	d	 Z
d
 Zd Zd ZddZy)r  Nc                    |D cg c]  }| c}| _         d }|xs) t        | j                   D cg c]
  } ||       c}      | _        y c c}w c c}w )Nc                r    t        | t              rt        S t        | t              rt        S | j                  S r.   )rW   r   r   rU   rY   rH   s    r"   get_typez tuple.__init__.<locals>.get_type  s*    !U#!S!  66Mr$   )r;   r  rY   )r0   r   rY   ir  rI   s         r"   rZ   ztuple.__init__  sJ    "&'Qq'	 JJT[['I'IJ	 ( (Js
   	AAc                ,   t        |t              rt        |      }t        |t              r| j                  |   S dd l}t        |t
        |j
                  f      sJ t        | j                  |j                  |j                  |j                           S Nr   )
rW   r   rU   r;   rR  rS  r  rV  rW  rX  )r0   idxrR  s      r"   r  ztuple.__getitem__  sq    c3C.Cc9%;;s##cE8>>#:;;;SYYsxx%@ABBr$   c                f    | j                   | j                  j                  j                  |         S r.   )r;   rY   r  r  r  s     r"   __getattr__ztuple.__getattr__  s&    {{499++11$788r$   c                |    t        |t              rt        |      }t        |t              sJ || j                  |<   y r.   )rW   r   rU   r;   )r0   r  rX   s      r"   __setitem__ztuple.__setitem__  s3    c3C.C#y))) Cr$   c                |    t        |t              rt        |      }t        | j                  |j                  z         S r.   )rW   r:   r  r;   rf   s     r"   rh   ztuple.__add__  s-    eT"%LET[[5<</00r$   c                j    t        |t              sJ t        | j                  |j                  z        S r.   )rW   rU   r  r;   rX   rf   s     r"   rs   ztuple.__mul__  s*    %+++T[[5;;.//r$   c                    dd l }t        |t        |j                  f      rt        |      }t	        | j
                  |j
                  k(        S r  )rR  rW   r:   r  rU   r;   )r0   rg   rR  s      r"   r   ztuple.__eq__  s9    edHNN34%LE455r$   c                T    dd l }t         |j                  | j                              S r  )rR  rk  r  r;   )r0   rR  s     r"   rl  ztuple.__hash__  s    NHNN4;;/00r$   c                d    t        | j                  D cg c]  }t        |       c}      S c c}w r.   )r   r;   )r0   rI   s     r"   r  ztuple.__str__  s#    DKK0qCF0110s   -c                ,    t        | j                        S r.   )rF   r;   r^   s    r"   r   ztuple.__iter__  s    DKK  r$   c                ,    t        | j                        S r.   )r6   r;   r^   s    r"   __len__ztuple.__len__  s    4;;r$   c                H    | j                   D ]  }|j                  |        y r.   )r;   r   )r0   r   r  s      r"   r   ztuple._flatten_ir  s!     	#AMM'"	#r$   r.   )r   r:   rY   r  )r  rU   )r   r   )r9   rR   rS   rZ   r  r  r  rh   rs   r   rl  r  r   r  r   r/   r$   r"   r  r    s@    
KC9!10612! #r$   r  c                      e Zd Zd Zy)rS  c                L    || _         || _        || _        t               | _        y r.   )rV  rW  rX  r  rY   )r0   rV  rW  rX  s       r"   rZ   zslice.__init__  s     
		L	r$   Nr  r/   r$   r"   rS  rS    s    !r$   rS  c                  <    e Zd ZddZd	dZd
dZddZddZddZy)tensor_descriptor_base_typec                    || _         y r.   r  )r0   r  s     r"   rZ   z$tensor_descriptor_base_type.__init__  s	    $r$   c                B    t        ||   | j                        }||dz   fS r  )$_experimental_tensor_descriptor_baser  )r0   r   r   rX   s       r"   r   z)tensor_descriptor_base_type._unflatten_ir  s%    4WV_dooVfqj  r$   c                V    |j                  | j                  j                  |            S r.   )create_tensor_descriptor_typer  r  r  s     r"   r  z!tensor_descriptor_base_type.to_ir  s"    44T__5J5J75STTr$   c                "    d| j                    dS )Nztensor_descriptor<r  r  r^   s    r"   r  z#tensor_descriptor_base_type.__str__  s    #DOO#4A66r$   c                b    t        |      t        |       ury| j                  |j                  k(  S r`  rY   r  rf   s     r"   r   z"tensor_descriptor_base_type.__eq__  s*    ;d4j(%"2"222r$   c                    | |k(   S r.   r/   rf   s     r"   __neq__z#tensor_descriptor_base_type.__neq__  r   r$   Nr  r  r   r   r   r   r   z0Tuple[_experimental_tensor_descriptor_base, int]r  r   r  )	r9   rR   rS   rZ   r   r  r  r   r  r/   r$   r"   r  r    s!    %!U73
#r$   r  c                       e Zd ZdZd fdZddZed        Zed        Zed        Z	ddZ
eddd	       Zeddd
       Zedddd       Zedddd       Z xZS )r  z="
    A tensor descriptor with unknown shape and strides
    c                P    t         |           || _        t        |      | _        y)r  N)r  rZ   r  r  rY   )r0   r  r  r  s      r"   rZ   z-_experimental_tensor_descriptor_base.__init__  s!    /
;	r$   c                :    |j                  | j                         y r.   r  r   s     r"   r   z0_experimental_tensor_descriptor_base._flatten_ir  r  r$   c                .    | j                   j                  S r.   r  r^   s    r"   r  z/_experimental_tensor_descriptor_base.block_type  s    yy###r$   c                B    | j                   j                  j                  S r.   )rY   r  r  r^   s    r"   block_shapez0_experimental_tensor_descriptor_base.block_shape  s    yy##)))r$   c                B    | j                   j                  j                  S r.   )rY   r  r  r^   s    r"   r   z*_experimental_tensor_descriptor_base.dtype  s    yy##...r$   c                ,    t        | j                        S r.   )r   rY   r^   s    r"   r  z,_experimental_tensor_descriptor_base.__str__  s    499~r$   Nc                4    t        j                  | |dd|      S )zLoad a block from the descriptor starting at the given element offsets.

        Values outside of the tensor bounds will be filled with zeros.

        :note: Offset must be a multiple of 16-bytes
        r+   )r   descriptor_load)r0   r}  r   s      r"   loadz)_experimental_tensor_descriptor_base.load   s     ''gr2xHHr$   c                2    t        j                  | |||      S )zStore a block from the descriptor starting at the given element offsets.

        Values outside of the tensor bounds will be ignored.

        :note: Offset must be a multiple of 16-bytes
        )r   descriptor_store)r0   r}  rX   r   s       r"   r{  z*_experimental_tensor_descriptor_base.store*  s     ((ugxHHr$   r_  c                   t        |      dk(  sJ dt        |              |d   }|d   }t        j                  | ||dd|      S )z)Gather multiple descriptors worth of datar   z5descriptor gather only supports 2D indexing, but got r   r   r+   )r6   r   descriptor_gather)r0   r   r   	x_offsetsy_offsets        r"   r  z+_experimental_tensor_descriptor_base.gather4  sV     4yA~b!VWZ[_W`Vabb~G	7))$	8RXVVr$   c                   t        |      dk(  sJ dt        |              |d   }|d   }t        j                  | ||||      S )z*Scatter multiple descriptors worth of datar   z6descriptor scatter only supports 2D indexing, but got r   r   )r6   r   descriptor_scatter)r0   rX   r   r   r  r  s         r"   scatterz,_experimental_tensor_descriptor_base.scatter<  sT     4yA~c!WX[\`XaWbcc~G	7**4	8XVVr$   r   r   r   r.   )r}  Sequence[constexpr | tensor]r   r?   )r}  r  rX   r?   r   r?   r  )r9   rR   rS   r8   rZ   r   r  r  r  r   r  r(   r  r{  r  r  r  r  s   @r"   r  r    s    <$ $ $ * * / / I I I I %) W W -1 W Wr$   r  c                  :     e Zd ZddZddZd fdZ fdZ xZS )tensor_descriptor_typec                .    || _         || _        || _        y r.   )r  
shape_typestrides_type)r0   r  r  r  s       r"   rZ   ztensor_descriptor_type.__init__G  s    $$(r$   c                    ||   }|dz  }| j                   j                  ||      \  }}| j                  j                  ||      \  }}|j                  }|j                  }t	        |||| j
                        }||fS r  )r  r   r  r;   _experimental_tensor_descriptorr  )r0   r   r   r  r  stridesrX   s          r"   r   z$tensor_descriptor_type._unflatten_irL  sz    !55gvFv++99'6J../wXf}r$   c                    t         |   |      g| j                  j                  |      | j                  j                  |      S r.   )r  r  r  r  )r0   r  r  s     r"   r  ztensor_descriptor_type.to_irV  s@    g&k)>)>w)Gk$J[J[JaJabiJjkkr$   c                    t         |   |      xr4 | j                  |j                  k(  xr | j                  |j                  k(  S r.   )r  r   r  r  )r0   rg   r  s     r"   r   ztensor_descriptor_type.__eq__Y  sK    w~e$ k$//U=M=M*M kTXTeTeW\WiWiUj 	kr$   )r  r  r  r  r  r  r  r  )r9   rR   rS   rZ   r   r  r   r  r  s   @r"   r  r  E  s    )
lk kr$   r  c                  ,     e Zd ZdZd fdZddZ xZS )r  z9A descriptor representing a tensor in global memory.
    c           
         t         |   ||       t        |t        |D cg c]  }|j                   c}      t        |D cg c]  }|j                   c}            | _        || _        || _        yc c}w c c}w )r  )r  r  N)r  rZ   r  r  rY   r  r  )r0   r  r  r  r  r  r  s         r"   rZ   z(_experimental_tensor_descriptor.__init__b  si     	,*!5"9a166"9:#W$=QVV$=>
	 
 #:$=s   A6A;c                    |j                  | j                         |j                  d | j                  D               |j                  d | j                  D               y )Nc              3  4   K   | ]  }|j                     y wr.   r  r  s     r"   r  z>_experimental_tensor_descriptor._flatten_ir.<locals>.<genexpr>q  s     4Aqxx4   c              3  4   K   | ]  }|j                     y wr.   r%  r  s     r"   r  z>_experimental_tensor_descriptor._flatten_ir.<locals>.<genexpr>r  s     6Aqxx6r&  )r  r  extendr  r  r   s     r"   r   z+_experimental_tensor_descriptor._flatten_iro  s>    t{{#444666r$   )r  List[tensor]r  r)  r  r  r   )r9   rR   rS   r8   rZ   r   r  r  s   @r"   r  r  ^  s    7r$   r  c                T    t        j                  | d      }|dk(  xs |dk(  xs |dk(  S )N01trueon)osgetenv)var_namer  s     r"   get_bool_env_varr2  u  s.    
		(C A8/qF{/a4i/r$   c                >    t        | t              r| j                  S | S r.   r   )r  s    r"   re   re   }  s    !YwwHr$   c                D    t        |       } t        j                  | |      S )z
    Returns the id of the current program instance along the given :code:`axis`.

    :param axis: The axis of the 3D launch grid. Must be 0, 1 or 2.
    :type axis: int
    )re   r   
program_idrt  r   s     r"   r5  r5    s!     t$DtX..r$   c                D    t        |       } t        j                  | |      S )z
    Returns the number of program instances launched along the given :code:`axis`.

    :param axis: The axis of the 3D launch grid. Must be 0, 1 or 2.
    :type axis: int
    )re   r   num_programsr6  s     r"   r8  r8    s!     t$D  x00r$   c                \    t        |       } t        |      }t        j                  | ||      S r.   )re   r   arange)rV  endr   s      r"   r:  r:    s)    &E
c
"C??5#x00r$   z
    Returns contiguous values within the half-open interval :code:`[start,
    end)`.  :code:`end - start` must be less than or equal to
    :code:`TRITON_MAX_TENSOR_NUMEL = z`

    :param start: Start of the interval. Must be a power of two.
    :type start: int32
    :param end: End of the interval. Must be a power of two greater than
        :code:`start`.
    :type end: int32
c                T    t        |       } | D cg c]  }t        |       c}S c c}w r.   )re   )r  r  s     r"   r  r    s&    &E,12q"222s   %c                2    t        |       } t        |        | S r.   )r  r   r  s    r"   _shape_check_implr>    s    % ELr$   c                t    t        |       } t        |      }t        |      }t        j                  | |||      S )ax  
    Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`.

    :param shape: Shape of the new array, e.g., (8, 16) or (8, )
    :type shape: tuple of ints
    :param value: A scalar value to fill the array with
    :type value: scalar
    :param dtype: Data type of the new array, e.g., :code:`tl.float16`
    :type dtype: tl.dtype
    )r>  re   r   full)r  rX   r   r   s       r"   r@  r@    s7     e$E&E&E==uh77r$   c                0    t        j                  | ||      S )z
    Tries to broadcast the two given blocks to a common compatible shape.

    :param input: The first input tensor.
    :type input: Block
    :param other: The second input tensor.
    :type other: Block
    )r   broadcast_impl_value)inputrg   r   s      r"   	broadcastrD    s     ((x@@r$   r_  c               X    t        t        |            }t        j                  | ||      S )ae  
    Tries to broadcast the given tensor to a new :code:`shape`.

    :param input: The input tensor.
    :type input: Block
    :param shape: The desired shape.
    :type shape:

    :code:`shape` can be passed as a tuple or as individual parameters: ::

        # These are equivalent
        broadcast_to(x, (32, 32))
        broadcast_to(x, 32, 32)
    )r>  rJ   r   broadcast_impl_shaperC  r   r  s      r"   rf  rf    s)    " .u56E((x@@r$   c               N    t        |      }|sd}t        j                  | ||      S )a  
    Permutes the dimensions of a tensor.

    If the parameter :code:`dims` is not specified, the function defaults to a (1,0) permutation,
    effectively transposing a 2D tensor.

    :param input: The input tensor.
    :param dims: The desired ordering of dimensions.  For example,
        :code:`(2, 1, 0)` reverses the order dims in a a 3D tensor.

    :code:`dims` can be passed as a tuple or as individual parameters: ::

        # These are equivalent
        trans(x, (2, 1, 0))
        trans(x, 2, 1, 0)

    :py:func:`permute` is equivalent to this function, except it doesn't
    have the special case when no permutation is specified.
    )r   r   rJ   r   rm  rC  r   rj  s      r"   rk  rk    s+    , D!DE422r$   c               F    t        |      }t        j                  | ||      S )a  
    Permutes the dimensions of a tensor.

    :param input: The input tensor.
    :type input: Block
    :param dims: The desired ordering of dimensions.  For example,
        :code:`(2, 1, 0)` reverses the order dims in a a 3D tensor.

    :code:`dims` can be passed as a tuple or as individual parameters: ::

        # These are equivalent
        permute(x, (2, 1, 0))
        permute(x, 2, 1, 0)

    :py:func:`trans` is equivalent to this function, except when
    :code:`dims` is empty, it tries to do a (1,0) permutation.
    rI  rJ  s      r"   rm  rm    s#    ( D!DE422r$   Fc                2    t        j                  | |||      S )a  
    Concatenate the given blocks

    :param input: The first input tensor.
    :type input: Tensor
    :param other: The second input tensor.
    :type other: Tensor
    :param reorder: Compiler hint. If true, the compiler is
        allowed to reorder elements while concatenating inputs.  Only use if the
        order does not matter (e.g., result is only used in reduction ops).
        Current implementation of `cat` supports only can_reorder=True.
    )r   cat)rC  rg   can_reorderr   s       r"   rM  rM  3  s     <<uk8<<r$   c                0    t        j                  | ||      S )aZ  
    Join the given tensors in a new, minor dimension.

    For example, given two tensors of shape (4,8), produces a new tensor of
    shape (4,8,2).  Given two scalars, returns a tensor of shape (2).

    The two inputs are broadcasted to be the same shape.

    If you want to join more than two elements, you can use multiple calls to
    this function.  This reflects the constraint in Triton that tensors must
    have power-of-two sizes.

    join is the inverse of split.

    :param a: The first input tensor.
    :type a: Tensor
    :param b: The second input tensor.
    :type b: Tensor
    )r   r  )abr   s      r"   r  r  D  s    * ==Ax((r$   c                    | S r.   r/   )rP  rQ  s     r"   _take_firstrS  \  s    Hr$   c           
     R   t        | j                        dk(  }|rt        j                  | d|      } t        j                  | |      \  }}|rXt        j                  t        t        |dt        ||            }t        j                  t        t        |dt        ||            }||fS )a  
    Split a tensor in two along its last dim, which must have size 2.

    For example, given a tensor of shape (4,8,2), produces two tensors of shape
    (4,8).  Given a tensor of shape (2), returns two scalars.

    If you want to split into more than two pieces, you can use multiple calls
    to this function (probably plus calling reshape).  This reflects the
    constraint in Triton that tensors must have power-of-two sizes.

    split is the inverse of join.

    :param a: The tensor to split.
    :type a: Tensor
    r   r   Nr   r*   )
r6   r  r   rU  r  typingr`  r?   r  rS  )rP  r   r*   
was_rank_1out_lhsout_rhss         r"   r  r  a  s    * QWW"J  Ax0~~a2GW++ffWdKRZgq&rs++ffWdKRZgq&rsGr$   c               r    t        d       t        t        |            }t        j                  | |d|      S )a  
    Returns a tensor with the same elements as `input` but a different shape.
    The order of the elements may not be preserved.

    :param input: The input tensor.
    :type input: Block
    :param shape: The desired shape.

    :code:`shape` can be passed as a tuple or as individual parameters: ::

        # These are equivalent
        view(x, (32, 32))
        view(x, 32, 32)
    zCview is deprecated, please use reshape with can_reorder being true.T)rN  r  )r   r>  rJ   r   rr  rG  s      r"   rp  rp    s4    " 		NO.u56EE5dHMMr$   )rN  r   c               Z    t        t        |            }t        j                  | |||      S )af  
    Returns a tensor with the same number of elements as input but with the
    provided shape.

    :param input: The input tensor.
    :type input: Block
    :param shape: The new shape.

    :code:`shape` can be passed as a tuple or as individual parameters: ::

        # These are equivalent
        reshape(x, (32, 32))
        reshape(x, 32, 32)
    )r>  rJ   r   rr  )rC  rN  r   r  s       r"   rr  rr    s+    " .u56EE5+x@@r$   c                `    | | cxk  r|k  sn t        d|  d|  d|       | dk\  r| S | |z   S )Nzinvalid axis z. Expected z <= axis < r   r   )rt  ndims     r"   
_wrap_axisr^    sF    ET D =k4%D6RSS194-$+-r$   c                   t        j                  | |      } t        |      }t        |t        t
        f      rt        |      n|g}t        | j                        t        |      z   }|D cg c]  }t        t        |      |       }}t        t        |            t        |      k7  rt        d|       | }t        |      D ]  }t        j                  |||      } |S c c}w )aR  
    Expand the shape of a tensor, by inserting new length-1 dimensions.

    Axis indices are with respect to the resulting tensor, so
    ``result.shape[axis]`` will be 1 for each axis.

    :param input: The input tensor.
    :type input: tl.tensor
    :param axis: The indices to add new axes
    :type axis: int | Sequence[int]

    z7expand_dims received duplicate axes, normalized axes = )r   rO   re   rW   r   r  r:   r6   r  r^  setr   sortedrU  )rC  rt  r   axesnew_ndimdrZ  rP  s           r"   rU  rU    s     uh/Et$D#D8U*;<4:4&D5;;#d)+HBFGQJ*1-x8GDG
3t9~T"RSWRXYZZ
CD\ 5""3845J Hs   +C"c                    t        j                  | |      } t        |      }t        |      }t        |      }|rt        j                  | ||      S t        j                  | |||      S )a  
    Casts a tensor to the given :code:`dtype`.

    :param dtype: The target data type.
    :type dtype: tl.dtype
    :param fp_downcast_rounding: The rounding mode for downcasting
        floating-point values. This parameter is only used when self is a
        floating-point tensor and dtype is a floating-point type with a
        smaller bitwidth. Supported values are :code:`"rtne"` (round to
        nearest, ties to even) and :code:`"rtz"` (round towards zero).
    :type fp_downcast_rounding: str, optional
    :param bitcast: If true, the tensor is bitcasted to the given
        :code:`dtype`, instead of being numerically casted.
    :type bitcast: bool, optional
    )r   rO   re   rb  r`  )rC  r   ra  rb  r   s        r"   r`  r`    sd    $ uh/E&E./CD!'*Guh77==x1EFFr$   c           	        |	|J d       |<|xr d|j                   j                  v }|r|s|dnd}	t        j                  d|	      }t	        |      }t	        |      }t	        |      }t        j                  | ||||||      S )a(  
    Returns the matrix product of two blocks.

    The two blocks must both be two-dimensional or three-dimensional and have compatible inner dimensions.
    For three-dimensional blocks, `tl.dot` performs the batched matrix product,
    where the first dimension of each block represents the batch dimension.

    :param input: The first tensor to be multiplied.
    :type input: 2D or 3D tensor of scalar-type in {:code:`int8`, :code:`float8_e5m2`, :code:`float16`, :code:`bfloat16`, :code:`float32`}
    :param other: The second tensor to be multiplied.
    :type other: 2D or 3D tensor of scalar-type in {:code:`int8`, :code:`float8_e5m2`, :code:`float16`, :code:`bfloat16`, :code:`float32`}
    :param acc: The accumulator tensor. If not None, the result is added to this tensor.
    :type acc: 2D or 3D tensor of scalar-type in {:code:`float16`, :code:`float32`, :code:`int32`}
    :param input_precision: How to exercise the Tensor Cores for f32 x f32. If
      the device does not have Tensor Cores or the inputs are not of dtype f32,
      this option is ignored. For devices that do have tensor cores, the
      default precision is tf32.
    :type input_precision: string. Available options for nvidia: :code:`"tf32"`, :code:`"tf32x3"`, :code:`"ieee"`. Default: :code:`"tf32"`. Available options for amd: :code:`"ieee"`, (CDNA3 only) :code:`"tf32"`.
    :param allow_tf32: *Deprecated.* If true, input_precision is set to "tf32".
      Only one of :code:`input_precision` and :code:`allow_tf32` can be
      specified (i.e. at least one must be :code:`None`).
    z;Only one of input_precision and allow_tf32 can be specifiedtf32ieeeTRITON_F32_DEFAULT)rq  allowed_dot_input_precisionsr/  r0  re   r   dot)
rC  rg   accinput_precision
allow_tf32max_num_imprecise_acc	out_dtyper   supports_tf32default_precisions
             r"   rk  rk    s    2 "j&8w:ww8 \Vx/?/?/\/\%\'4*
HZFbh))$8:KL)/:O#I.I/0EF<<uc?<QS\^fggr$   c
                t    t        |      }|t        k(  sJ d       t        j                  | |||||||||	
      S )aQ  
    Returns the matrix product of two blocks in microscaling format.

    lhs and rhs use microscaling formats described here:
    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

    Software emulation enables targeting hardware architectures without native microscaling
    operation support. Right now for such case, microscaled lhs/rhs are upcasted to
    :code:`bf16` element type beforehand for dot computation, with one exception:
    for AMD CDNA3 specifically, if one of the inputs is of :code:`fp16` element type,
    the other input is also upcasted to :code:`fp16` element type instead.
    This behavior is experimental and may be subject to change in the future.

    :param lhs: The first tensor to be multiplied.
    :type lhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
    :param lhs_scale: Scale factor for lhs tensor.
    :type lhs_scale: e8m0 type represented as an uint8 tensor.
    :param lhs_format: format of the lhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`, :code:`fp16`}.
    :type lhs_format: str
    :param rhs: The second tensor to be multiplied.
    :type rhs: 2D tensor representing fp4, fp8 or bf16 elements. Fp4 elements are packed into uint8 inputs with the first element in lower bits. Fp8 are stored as uint8 or the corresponding fp8 type.
    :param rhs_scale: Scale factor for rhs tensor.
    :type rhs_scale: e8m0 type represented as an uint8 tensor.
    :param rhs_format: format of the rhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`, :code:`fp16`}.
    :type rhs_format: str
    :param acc: The accumulator tensor. If not None, the result is added to this tensor.
    z5Only float32 is supported for out_dtype at the moment)re   float32r   
dot_scaled)
lhs	lhs_scale
lhs_formatrhs	rhs_scale
rhs_formatrl  	fast_mathrp  r   s
             r"   ru  ru    sM    < $I.IX!XXsIz3	:WZ\egp') )r$   c	                    t        |      }t        |      }|t        j                  ||      }|t        j                  ||      }t        |      }t        |      }t        |      }t        |      }t        j                  | ||||||||	      S )a;	  
    Return a tensor of data whose values are loaded from memory at location defined by `pointer`:

        (1) If `pointer` is a single element pointer, a scalar is be loaded.  In
            this case:

            - `mask` and `other` must also be scalars,
            - `other` is implicitly typecast to `pointer.dtype.element_ty`, and
            - `boundary_check` and `padding_option` must be empty.

        (2) If `pointer` is an N-dimensional tensor of pointers, an
            N-dimensional tensor is loaded.  In this case:

            - `mask` and `other` are implicitly broadcast to `pointer.shape`,
            - `other` is implicitly typecast to `pointer.dtype.element_ty`, and
            - `boundary_check` and `padding_option` must be empty.

        (3) If `pointer` is a block pointer defined by `make_block_ptr`, a
            tensor is loaded.  In this case:

            - `mask` and `other` must be `None`, and
            - `boundary_check` and `padding_option` can be specified to control the behavior of out-of-bound access.

    :param pointer: Pointer to the data to be loaded
    :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType`
    :param mask: if `mask[idx]` is false, do not load the data at address `pointer[idx]`
        (must be `None` with block pointers)
    :type mask: Block of `triton.int1`, optional
    :param other: if `mask[idx]` is false, return `other[idx]`
    :type other: Block, optional
    :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check
    :type boundary_check: tuple of ints, optional
    :param padding_option: should be one of {"", "zero", "nan"}, the padding value to use while out of bounds. "" means an undefined value.
    :param cache_modifier: changes cache option in NVIDIA PTX
    :type cache_modifier: str, optional, should be one of {"", "ca", "cg"}, where "ca" stands for
        cache at all levels and "cg" stands for cache at global level (cache in L2 and below, not L1), see
        `cache operator <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators>`_ for more details.
    :param eviction_policy: changes eviction policy in NVIDIA PTX
    :type eviction_policy: str, optional
    :param volatile: changes volatile option in NVIDIA PTX
    :type volatile: bool, optional
    )re   r   rO   r  )	pointerrw  rg   rx  padding_optionry  rz  volatiler   s	            r"   r  r  B  s    \ t$D&E!!$1""5(3(8N(8N)/:O"8,H==$~~~_n!8- -r$   c                Z    t        t        |      |      }t        j                  | ||      S )zQ
    Reinterpret a generic pointer as a TMA-backed tensor descriptor object.
    )r  re   r   reinterpret_tensor_descriptor)desc_ptrr  r   r   block_tys        r"   +_experimental_reinterpret_tensor_descriptorr  ~  s+     -e4kBH11(HhOOr$   c                F    t        | |||      }|j                  ||      S )a  
    Experimental feature to access TMA descriptors loads. This is an escape hatch to easily exercise TTGIR operations.
    This will be removed in the future and shouldn't be used in production code.

    This loads a tensor of data based on the descriptor and offsets.
    r_  )r  r  )desc_pointerr}  r  r   r   descs         r"   _experimental_descriptor_loadr    s(     7|UE\deD99Wx900r$   c                p    t        | |j                  |j                  |      }|j                  |||      S )a  
    Experimental feature to access TMA descriptors stores. This is an escape hatch to easily exercise TTGIR operations.
    This will be removed in the future and shouldn't be used in production code.

    This stores a tensor of data based on the descriptor and offsets.
    r_  )r  r  r   r{  )r  rX   r}  r   r  s        r"   _experimental_descriptor_storer    s5     7|U[[RWR]R]hpqD::gux:88r$   c           	         t        j                  ||      }t        |      }|t        j                  ||      }t        |      }t        |      }t        j                  | ||||||      S )a  
    Store a tensor of data into memory locations defined by `pointer`.

        (1) If `pointer` is a single element pointer, a scalar is stored.  In
            this case:

            - `mask` must also be scalar, and
            - `boundary_check` and `padding_option` must be empty.

        (2) If `pointer` is an N-dimensional tensor of pointers, an
            N-dimensional block is stored.  In this case:

            - `mask` is implicitly broadcast to `pointer.shape`, and
            - `boundary_check` must be empty.

        (3) If `pointer` is a block pointer defined by `make_block_ptr`, a block
            of data is stored.  In this case:

            - `mask` must be None, and
            - `boundary_check` can be specified to control the behavior of out-of-bound access.

    `value` is implicitly broadcast to `pointer.shape` and typecast to `pointer.dtype.element_ty`.

    :param pointer: The memory location where the elements of `value` are stored
    :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType`
    :param value: The tensor of elements to be stored
    :type value: Block
    :param mask: If `mask[idx]` is false, do not store `value[idx]` at `pointer[idx]`
    :type mask: Block of triton.int1, optional
    :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check
    :type boundary_check: tuple of ints, optional
    :param cache_modifier: changes cache option in NVIDIA PTX
    :type cache_modifier: str, optional, should be one of {"", ".wb", ".cg", ".cs", ".wt"}, where ".wb" stands for
        cache write-back all coherent levels, ".cg" stands for cache global, ".cs" stands for cache streaming, ".wt"
        stands for cache write-through, see `cache operator <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators>`_ for more details.
    :param eviction_policy: changes eviction policy in NVIDIA PTX
    :type eviction_policy: str, optional, should be one of {"", "evict_first", "evict_last"}
    )r   rO   re   r{  )r~  rX   rw  rx  ry  rz  r   s          r"   r{  r{    sj    T uh/Et$D!!$1(8N)/:O>>'5$P_aijjr$   c           	     8    t        j                  | ||||||      S )ak  
    Returns a pointer to a block in a parent tensor

    :param base: The base pointer to the parent tensor
    :param shape: The shape of the parent tensor
    :param strides: The strides of the parent tensor
    :param offsets: The offsets to the block
    :param block_shape: The shape of the block
    :param order: The order of the original data format
    )r   make_block_ptr)baser  r  r}  r  orderr   s          r"   r  r    s"     ""4+uV^__r$   c                0    t        j                  | ||      S )z
    Advance a block pointer

    :param base: the block pointer to advance
    :param offsets: the offsets to advance, a tuple by dimension
    )r   r~  )r  r}  r   s      r"   r~  r~    s     D'844r$   c                4    t        j                  | ||||      S )a  Make an experimental tensor descriptor object

    :param base: the base pointer of the tensor, must be 16-byte aligned
    :param shape: A list of non-negative integers representing the tensor shape
    :param strides: A list of tensor strides. Leading dimensions must be multiples
        of 16-byte strides and the last dimension must be contiguous.
    :param block_shape: The shape of block to be loaded/stored from global memory

    Notes
    *****
    On NVIDIA GPUs with TMA support, this will result in a TMA descriptor object
    and loads and stores from the descriptor will be backed by the TMA hardware.

    Currently only 2-5 dimensional tensors are supported.

    Example
    *******
    .. code-block:: python

        @triton.jit
        def inplace_abs(in_out_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
            desc = tl._experimental_make_tensor_descriptor(
                in_out_ptr,
                shape=[M, N],
                strides=[N, 1],
                block_shape=[M_BLOCK, N_BLOCK],
            )

            moffset = tl.program_id(0) * M_BLOCK
            noffset = tl.program_id(1) * N_BLOCK

            value = desc.load([moffset, noffset])
            desc.store([moffset, noffset], tl.abs(value))

        # TMA descriptors require a global memory allocation
        def alloc_fn(size: int, alignment: int, stream: Optional[int]):
            return torch.empty(size, device="cuda", dtype=torch.int8)

        triton.set_allocator(alloc_fn)

        M, N = 256, 256
        x = torch.randn(M, N, device="cuda")
        M_BLOCK, N_BLOCK = 32, 32
        grid = (M / M_BLOCK, N / N_BLOCK)
        inplace_abs[grid](x, M, N, M_BLOCK, N_BLOCK)

    )r   make_tensor_descriptor)r  r  r  r  r   s        r"   $_experimental_make_tensor_descriptorr    s    n **4hWWr$   c                     d fd}|S )Nc                :    d d}r|dz  }|dz  }|| _         | S )Nz
    Performs an atomic z at the memory location specified by :code:`pointer`.

    Return the data stored at :code:`pointer` before the atomic operation.

    :param pointer: The memory locations to operate on
    :type pointer: Block of dtype=triton.PointerDTypez|
    :param cmp: The values expected to be found in the atomic object
    :type cmp: Block of dtype=pointer.dtype.element_tya  
    :param val: The values with which to perform the atomic operation
    :type val: Block of dtype=pointer.dtype.element_ty
    :param sem: Specifies the memory semantics for the operation. Acceptable values are "acquire",
        "release", "acq_rel" (stands for "ACQUIRE_RELEASE"), and "relaxed". If not provided,
        the function defaults to using "acq_rel" semantics.
    :type sem: str, optional
    :param scope: Defines the scope of threads that observe the synchronizing effect of the atomic operation.
        Acceptable values are "gpu" (default), "cta" (cooperative thread array, thread block), or "sys" (stands for "SYSTEM"). The default value is "gpu".
    :type scope: str, optional
    )r8   )funcdocstrhas_cmpr2   s     r"   
_decoratorz&_add_atomic_docstr.<locals>._decorator/  sL    v 69  : :F 	 
 
	 r$   r  r   r   r   r/   )r2   r  r  s   `` r"   _add_atomic_docstrr  -  s    4 r$   zcompare-and-swapT)r  c                    t        j                  ||      }t        j                  ||      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  )r~  r  r  r  r  r   s         r"   r  r  L  sW     

S(
+C


S(
+C
c
"C&EwS#uhGGr$   exchangec                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r~  r  rw  r  r  r   s         r"   r  r  W  sO     

S(
+C
c
"C&Et$DdCIIr$   r  c                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r  b  O     

S(
+C
c
"C&Et$DwT3xHHr$   r  c                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r  m  r  r$   r  c                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r  x  r  r$   zlogical andc                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r    r  r$   z
logical orc                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r    sO     

S(
+C
c
"C&Et$DgsD#uhGGr$   zlogical xorc                    t        j                  ||      }t        |      }t        |      }t        |      }t        j                  | |||||      S r.   )r   rO   re   r  r  s         r"   r  r    r  r$   c                    t        j                  | |      } t        |      }t        |      }t        j                  | |||      S )a  
    Returns a tensor of elements from either :code:`x` or :code:`y`, depending on :code:`condition`.

    Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`.

    If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.

    The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`.
    :code:`x` and :code:`y` must have the same data type.

    :param condition: When True (nonzero), yield x, otherwise yield y.
    :type condition: Block of triton.bool
    :param x: values selected at indices where condition is True.
    :param y: values selected at indices where condition is False.
    )r   rO   r   where)	conditionrI   yr   s       r"   r  r    s?    " ""9h7IQAQA>>)Q844r$   c                ^    t        |       } t        |      }t        j                  | |||      S r.   )r   r   r  rI   r  r  r   s       r"   r  r    ,    QAQA<<1/::r$   c                ^    t        |       } t        |      }t        j                  | |||      S r.   )r   r   r  r  s       r"   r  r    r  r$   c                ^    t        |       } t        |      }t        j                  | |||      S r.   )r   r   r  r  s       r"   r  r    r  r$   c                    t        j                  | |      } t        j                  ||      }t        | |      } t        ||      }t        |      }t        j                  | |||      S )aH  
    Computes the element-wise minimum of :code:`x` and :code:`y`.

    :param x: the first input tensor
    :type x: Block
    :param y: the second input tensor
    :type y: Block
    :param propagate_nan: whether to propagate NaN values.
    :type propagate_nan: tl.PropagateNan

    .. seealso:: :class:`tl.PropagateNan`
    r_  )r   rO   _promote_bfloat16_to_float32re   minimumrI   r  propagate_nanr   s       r"   r  r    a     	1h'A1h'A$Q:A$Q:A'6MAq-::r$   c                    t        j                  | |      } t        j                  ||      }t        | |      } t        ||      }t        |      }t        j                  | |||      S )aH  
    Computes the element-wise maximum of :code:`x` and :code:`y`.

    :param x: the first input tensor
    :type x: Block
    :param y: the second input tensor
    :type y: Block
    :param propagate_nan: whether to propagate NaN values.
    :type propagate_nan: tl.PropagateNan

    .. seealso:: :class:`tl.PropagateNan`
    r_  )r   rO   r  re   maximumr  s       r"   r  r    r  r$   c                   t        j                  | |      } t        j                  ||      }t        j                  ||      }t        | |      } t        ||      }t        ||      }t        |      }t        j                  | ||||      S )a<  
    Clamps the input tensor :code:`x` within the range [min, max].
    Behavior when :code:`min` > :code:`max` is undefined.

    :param x: the input tensor
    :type x: Block
    :param min: the lower bound for clamping
    :type min: Block
    :param max: the upper bound for clamping
    :type max: Block
    :param propagate_nan: whether to propagate NaN values. Applies only to the :code:`x` tensor.
        If either :code:`min` or :code:`max` is NaN, the result is undefined.
    :type propagate_nan: tl.PropagateNan

    .. seealso:: :class:`tl.PropagateNan`
    r_  )r   rO   r  re   clamp)rI   r  r  r  r   s        r"   r  r  	  s    $ 	1h'A


S(
+C


S(
+C$Q:A
&sX
>C
&sX
>C'6M>>!S#}h??r$   c                      d fd}|S )Nc                    d}|d d d dz  }|d d d dz  }|d d d	 d
z  }|j                        | _        | S )Na~  
    Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis`

    :param input: the input values
    :type input: Tensor
    :param axis: the dimension along which the reduction should be done. If None, reduce all dimensions
    :type axis: int
    :param keep_dims: if true, keep the reduced dimensions with length 1
    :type keep_dims: boolz
    :param z-: if true, return index corresponding to the z value
    :type z: boolzC: if true, in case of a tie (i.e., multiple elements have the same zI value), return the left-most index for values that aren't NaN
    :type zc: the desired data type of the returned tensor. If specified, the input tensor is casted to :code:`z` before the operation is performed. This is useful for preventing data overflows. If not specified, integer and bool dtypes are upcasted to :code:`tl.int32` and float dtypes are upcasted to at least :code:`tl.float32`.
    :type z
: tl.dtyper1   formatr8   )r  r  	dtype_argr2   return_indices_argtie_break_args     r"   r  z)_add_reduction_docstr.<locals>._decorator,	  s     ) LTF S
f( (F $ ?]^b]c d/# #F   ;y  {D  zE E+Z# #F }}$}/r$   r  r/   )r2   r  r  r  r  s   ```` r"   _add_reduction_docstrr  )	  s     4 r$   c              #  V   K   | j                         }d  | j                  |       y wr.   )get_insertion_pointrestore_insertion_point)r  ips     r"   _insertion_guardr  I	  s%     		$	$	&B	##B's   ')c                z    t         t              rt         f|      d   S  fd}fdt              t        |      }"t	        t         d   j                              t        j                   |      }|r.t        fd|D              }|S t         fd|D              }|S )a'  Applies the combine_fn to all elements in :code:`input` tensors along the provided :code:`axis`

    :param input: the input tensor, or tuple of tensors
    :type input: Tensor
    :param axis: the dimension along which the reduction should be done. If None, reduce all dimensions
    :type axis: int | None
    :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit)
    :type combine_fn: Callable
    :param keep_dims: if true, keep the reduced dimensions with length 1
    :type keep_dims: bool

    r  r   r*   r   c           
     J   D cg c]  }|j                   j                   c}dz  }| j                  d      }t              5  fd}j	                  |t        t        ||                  }t        |      D cg c]   \  }}t        |j                  |      |      " }}}j                  |i       }	t        |	t              r|	j                  g}
n|	D cg c]  }|j                   }
} j                  |
  d d d        y c c}w c c}}w c c}w # 1 sw Y   y xY w)Nr   r   c                &    | j                        S r.   r  r   r   s    r"   <lambda>z5reduce.<locals>.make_combine_region.<locals>.<lambda>f	      aggh/ r$   r    )rY   r   
get_regionr  create_block_with_parentr:   maprT  r?   argcall_JitFunctionrW   r  create_reduce_ret)	reduce_optparam_typesregionr  blockr  r  r   resultsr   rr   r*   r  rC  s               r"   make_combine_regionz#reduce.<locals>.make_combine_regionb	  s   .34qvv}}4q8%%a(h' 		1/E55fd3ukCZ>[\E:CK:PQBF599Q<,QDQ 11*d21NG'6*">>*-4518855&H&&0		1 		1 5
 R
 6		1 		1/   D	9D>%D#9DD/DDD"c                X    t        j                  |      D ]  }t        | d      }  | S )Nr   r_  )rR  rangerU  )r  ndims_r   s      r"   expand_ndimszreduce.<locals>.expand_ndimsp	  s.    & 	5AAq84A	5r$   c              3  :   K   | ]  }t        |         yw)r_  N)rU  )r  r  r   rt  s     r"   r  zreduce.<locals>.<genexpr>|	  s     MAAth??Ms   c              3  \   K   | ]#  } |t        d    j                               % yw)r   N)r6   r  )r  r  r  rC  s     r"   r  zreduce.<locals>.<genexpr>~	  s%     JQE!HNN(;<Js   ),)
rW   r?   r  re   r^  r6   r  r   	reductionr  )	rC  rt  r  r  r   r*   r  rZ  r  s	   ``` ``  @r"   r  r  P	  s     % uizYQYfpqrstt1
 t$D#I.I$E!HNN 34


UD*=x
HCMMMC J JcJJCJr$   c                p    | j                   j                  }|t        u r| j                  t        |      S | S )Nr_  )rY   r   bfloat16rc  rt  )r  r   	scalar_tys      r"   r  r  	  s1    I HttGht//Hr$   c                   t        |      }| j                  |   }t        d||      }t        | j                        dkD  rgt	        j
                  t        | j                              D cg c]  }t        |       }	}|	|= t        ||	|      }t        || j                  |      }t        | |f|||||      \  }
}|
|fS c c}w )Nr   r_  r   r  )
re   r  r:  r6   rR  r  rU   rU  rf  r  )rC  rt  r  r  r   r*   nr  rd  axes_to_expandrvaluerindicess               r"   _reduce_with_indicesr  	  s    t$DDA1a(+E
5;;!08s5;;?O0PQ1)A,QQ4 E>HEUEKK(CuendJ)^f)35FH8 Rs   +Cc                     d fd}|S )Nc                :    d}|j                        | _        | S )Nz
    Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis`

    :param input: the input values
    :type input: Tensor
    :param axis: the dimension along which the scan should be done
    :type axis: intr1   r  )r  r  r2   s     r"   r  z$_add_scan_docstr.<locals>._decorator	  s#     }}$}/r$   r  r/   )r2   r  s   ` r"   _add_scan_docstrr  	  s    	 r$   c                     t         t              rt         f||      d   S  fd}t        |      }|"t	        |t         d   j                              }t        j                   |||      S )a6  Applies the combine_fn to each elements with a carry in :code:`input` tensors along the provided :code:`axis` and update the carry

    :param input: the input tensor, or tuple of tensors
    :type input: Tensor
    :param axis: the dimension along which the reduction should be done
    :type axis: int
    :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit)
    :type combine_fn: Callable
    :param reverse: whether to apply the associative scan in the reverse direction along axis
    :type reverse: bool

    rU  r   c           
     J   D cg c]  }|j                   j                   c}dz  }| j                  d      }t              5  fd}j	                  |t        t        ||                  }t        |      D cg c]   \  }}t        |j                  |      |      " }}}j                  |i       }	t        |	t              r|	j                  g}
n|	D cg c]  }|j                   }
} j                  |
  d d d        y c c}w c c}}w c c}w # 1 sw Y   y xY w)Nr   r   c                &    | j                        S r.   r  r  s    r"   r  z?associative_scan.<locals>.make_combine_region.<locals>.<lambda>	  r  r$   r  )rY   r   r  r  r  r:   r  rT  r?   r  r  rW   r  create_scan_ret)scan_opr  r  r  r  r  r  r  r   r  r   r  r   r*   r  rC  s               r"   r  z-associative_scan.<locals>.make_combine_region	  s   .34qvv}}4q8##A&h' 		//E55fd3ukCZ>[\E:CK:PQBF599Q<,QDQ 11*d21NG'6*">>*-4518855$H$$g.		/ 		/ 5
 R
 6		/ 		/r  )rW   r?   r  re   r^  r6   r  r   )rC  rt  r  r  r   r*   r  s   ` ` `` r"   r  r  	  sx     % 	4Wxdnopqrr/ t$D$E!HNN 34$$UD2EwPXYYr$   c                F    t        |      }t        j                  | ||      S )zcomputes an histogram based on input tensor with num_bins bins, the bins have a width of 1 and start at 0.

    :param input: the input tensor
    :type input: Tensor
    :param num_bins: number of histogram bins
    :type num_bins: int

    )re   r   r  )rC  r  r   r*   s       r"   r  r  	  s#     #8,HeXx88r$   c                H    t        |      }t        j                  | |||      S )zGather from a tensor along a given dimension.

    :param src: the source tensor
    :type src: Tensor
    :param index: the index tensor
    :type index: Tensor
    :param axis: the dimension to gather along
    :type axis: int

    )re   r   r  )srcr  rt  r   s       r"   r  r  	  s#     t$D??3tX66r$   c                ,    t        j                  |       S )zA
    Insert a barrier to synchronize all threads in a block.
    )r   debug_barrierr_  s    r"   r  r  	  s    
 !!(++r$   c           	     r   t        |t              r|g}t        |      D ]c  \  }}t        |t              st        d| d      t        |j                  t
              r@t        d| dt        |j                         d       |D cg c]  }|j                   }}t        j                  | |      S c c}w )zd
    Let the compiler know that the values in :code:`input` are all multiples of :code:`value`.
    values element  must have type `constexpr`1 must have type `constexpr[int]`, got `constexpr[r\   )	rW   rU   rT  rG   rX   r   rY   r   multiple_ofrC  r;   r   r  rd  rI   s         r"   r  r  
  s    
 &)$&! t1!Y'oaS0KLMM!''3'oaS0abfghgngnboappqrss	t
  &&!agg&F&v.. '   	B4c           	     r   t        |t              r|g}t        |      D ]c  \  }}t        |t              st        d| d      t        |j                  t
              r@t        d| dt        |j                         d       |D cg c]  }|j                   }}t        j                  | |      S c c}w )z^
    Let the compiler know that the `value` first values in :code:`input` are contiguous.
    r  r   r  r\   )	rW   rU   rT  rG   rX   r   rY   r   max_contiguousr  s         r"   r  r  
  s    
 &)$&! t1!Y'oaS0KLMM!''3'oaS0abfghgngnboappqrss	t
  &&!agg&F&""5&11 'r  c           	     r   t        |t              r|g}t        |      D ]c  \  }}t        |t              st        d| d      t        |j                  t
              r@t        d| dt        |j                         d       |D cg c]  }|j                   }}t        j                  | |      S c c}w )z
    Let the compiler know that the `value` first values in :code:`input` are constant.

    e.g. if :code:`values` is [4], then each group of 4 values in :code:`input` should all be equal,
    for example [0, 0, 0, 0, 1, 1, 1, 1].
    r  r   r  r\   )	rW   rU   rT  rG   rX   r   rY   r   max_constancyr  s         r"   r  r  &
  s     &)$&! t1!Y'oaS0KLMM!''3'oaS0abfghgngnboappqrss	t
  &&!agg&F&!!%00 'r  c                V    t        j                  t        j                  | |      |      S )z<
    Allow compiler to assume the :code:`cond` is True.
    )r   assumerO   )condr   s     r"   r
  r
  9
  s"    
 ??8--dH=xHHr$    
)sepr;  fileflushr   c                     y)a  
    Print the values at compile time.  The parameters are the same as the builtin :code:`print`.

    NOTE: Calling the Python builtin :code:`print` is not the same as calling this, it instead maps to :code:`device_print`,
    which has special requirements for the arguments.

    .. highlight:: python
    .. code-block:: python

        tl.static_print(f"BLOCK_SIZE={BLOCK_SIZE}")
    Nr/   )r  r;  r  r  r   r;   s         r"   static_printr  F
  s     	r$   c                     y)z
    Assert the condition at compile time.  Does not require that the :code:`TRITON_DEBUG` environment variable
    is set.

    .. highlight:: python
    .. code-block:: python

        tl.static_assert(BLOCK_SIZE == 1024)
    Nr/   r  msgr   s      r"   static_assertr  V
  s     	r$   )hexr   c               .   ddl }t        |       } t        | t              s
J |  d       d}| D ]  }||j                  vsd} n |s
J |  d       g }|D ]'  }|j                  t        j                  ||             ) t        j                  | |||      S )a  
    Print the values at runtime from the device.  String formatting does not work for runtime values, so you should
    provide the values you want to print as arguments.  The first value must be a string, all following values must
    be scalars or tensors.

    Calling the Python builtin :code:`print` is the same as calling this function, and the requirements for the arguments will match
    this function (not the normal requirements for :code:`print`).

    .. highlight:: python
    .. code-block:: python

        tl.device_print("pid", pid)
        print("pid", pid)

    On CUDA, printfs are streamed through a buffer of limited size (on one host,
    we measured the default as 6912 KiB, but this may not be consistent across
    GPUs and CUDA versions).  If you notice some printfs are being dropped, you
    can increase the buffer size by calling

    .. highlight:: python
    .. code-block:: python

        triton.runtime.driver.active.utils.set_printf_fifo_size(size_bytes)

    CUDA may raise an error if you try to change this value after running a
    kernel that uses printfs.  The value set here may only affect the current
    device (so if you have multiple GPUs, you'd need to call it multiple times).

    :param prefix: a prefix to print before the values. This is required to be a string literal.
    :param args: the values to print. They can be any tensor or scalar.
    :param hex: print all values as hex instead of decimal
    r   Nz is not stringTFz is not an ascii string)	stringre   rW   r   	printabler  r   rO   device_print)	prefixr  r   r   r  b_asciichnew_argsr  s	            r"   r  r  d
  s    D  (Ffc"=vhn$=="G V%%%G 6vh5667H ;**39:;  3AAr$   c                n    t        |      }t        j                  t        j                  | |      ||      S )a(  
    Assert the condition at runtime from the device.  Requires that the environment variable :code:`TRITON_DEBUG`
    is set to a value besides :code:`0` in order for this to have any effect.

    Using the Python :code:`assert` statement is the same as calling this function, except that the second argument
    must be provided and must be a string, e.g. :code:`assert pid == 0, "pid != 0"`.  The environment variable must
    be set for this :code:`assert` statement to have any effect.

    .. highlight:: python
    .. code-block:: python

        tl.device_assert(pid == 0)
        assert pid == 0, f"pid != 0"

    :param cond: the condition to assert. This is required to be a boolean tensor.
    :param msg: the message to print if the assertion fails. This is required to be a string literal.
    )re   r   device_assertrO   r  s      r"   r!  r!  
  s0    & c
"C!!("4"4T8"Dc8TTr$   c                |   t        |       } t        |      }t        |      }t        |      }	 t        |       d}t        j                  t
        t           |      }|}|D 	cg c]  }	t        j                  |	|       c}	x}
rt        t        j                  |ddd      }|
d   }|
D ]  } |||      \  }} |j                  rEt        |
      D ]  \  }} |||      \  |
|<   } |D cg c]  }t        ||j                         }}|
D cg c]  }|j                   }}|j                  | |||D cg c]  }|j!                  |       c}||      |st#        j%                  d      |d         S t'        fdt        |      D              S # t        $ r	 d}|f}Y jw xY wc c}	w c c}w c c}w c c}w )ac  
        Execute inline assembly over a tensor.  Essentially, this is :code:`map`
        where the function is inline assembly.

        The input tensors :code:`args` are implicitly broadcasted to the same shape.

        :code:`dtype` can be a tuple of types, in which case the output is a
        tuple of tensors.

        Each invocation of the inline asm processes :code:`pack` elements at a
        time.  Exactly which set of inputs a block receives is unspecified.
        Input elements of size less than 4 bytes are packed into 4-byte
        registers.

        This op does not support empty :code:`dtype` -- the inline asm must
        return at least one tensor, even if you don't need it.  You can work
        around this by returning a dummy tensor of arbitrary type; it shouldn't
        cost you anything if you don't use it.

        Example using
        `PTX <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html>`_
        assembly:

        .. highlight:: python
        .. code-block:: python

            @triton.jit
            def kernel(A, B, C, D, BLOCK: tl.constexpr):
                a = tl.load(A + tl.arange(0, BLOCK)) # uint8 tensor
                b = tl.load(B + tl.arange(0, BLOCK)) # float32 tensor

                # For each (a,b) in zip(a,b), perform the following:
                # - Let ai be `a` converted to int32.
                # - Let af be `a` converted to float.
                # - Let m be the max of ai and b.
                # - Return ai and mi.
                # Do the above 4 elements at a time.
                (c, d) = tl.inline_asm_elementwise(
                    asm="""
                    {
                        // Unpack `a` into `ai`.
                        .reg .b8 tmp<4>;
                        mov.b32 {tmp0, tmp1, tmp2, tmp3}, $8;
                        cvt.u32.u8 $0, tmp0;
                        cvt.u32.u8 $1, tmp1;
                        cvt.u32.u8 $2, tmp2;
                        cvt.u32.u8 $3, tmp3;
                    }
                    // Convert `ai` to float.
                    cvt.rn.f32.s32 $4, $0;
                    cvt.rn.f32.s32 $5, $1;
                    cvt.rn.f32.s32 $6, $2;
                    cvt.rn.f32.s32 $7, $3;
                    // Take max of `ai` and `b`.
                    max.f32 $4, $4, $9;
                    max.f32 $5, $5, $10;
                    max.f32 $6, $6, $11;
                    max.f32 $7, $7, $12;
                    """,
                    constraints=(
                        # 8 output registers, namely
                        #   $0=ai0, $1=ai1, $2=ai2, $3=ai3,
                        #   $4=m0,  $5=m1,  $6=m2,  $7=m3.
                        "=r,=r,=r,=r,=r,=r,=r,=r,"
                        # 5 input registers, namely
                        #   $8=ai,
                        #   $9=b0, $10=b1, $11=b2, $12=b3.
                        # The four elements from `a` are all packed into one register.
                        "r,r,r,r,r"),
                    args=[a, b],
                    dtype=(tl.int32, tl.float32),
                    is_pure=True,
                    pack=4,
                )
                tl.store(C + tl.arange(0, BLOCK), c)
                tl.store(D + tl.arange(0, BLOCK), d)

        :param asm: assembly to run.  Must match target's assembly format.
        :param constraints: asm constraints in
            `LLVM format <https://llvm.org/docs/LangRef.html#inline-asm-constraint-string>`_
        :param args: the input tensors, whose values are passed to the asm block
        :param dtype: the element type(s) of the returned tensor(s)
        :param is_pure: if true, the compiler assumes the asm block has no side-effects
        :param pack: the number of elements to be processed by one instance of inline assembly
        :param _builder: the builder
        :return: one tensor or a tuple of tensors of the given dtypes
    TF)r  arithmetic_checkallow_lhs_ptrallow_rhs_ptrr   c              3  Z   K   | ]"  \  }}t        j                  |      |       $ y wr.   )r?   
get_result)r  r  r  calls      r"   r  z)inline_asm_elementwise.<locals>.<genexpr>,  s%     PEAr*B/Ps   (+)re   rF   rG   rV  r`  r   _DtypeClassr   rO   r   binary_op_type_checking_implr  rT  r  r  create_inline_asmr  r?   r'  r  )asmconstraintsr   r   is_purepackr   has_multiple_outputsres_tysr  dispatch_argsbin_op_type_checkingbroadcast_argitemr  r  dtr  r   r  r(  s                       @r"   inline_asm_elementwiser7  
  s   t c
"C%k2Kt$D!'*GU#
 KK-u5EGFJKs++C:KK}K&11" 
 &a(! 	ID3D-HA}	I$]3 P4&:4&O#a !PEJKrz"m&9&9:KGK!./Aqxx/G/%%c;_fAgY["((8BTAgiprvwDdooa('!*55PYw=OPPP;  $	 L  L/Ags)   F #F*-F/F47F9F'&F'c                  $    e Zd ZdZddZd Zd Zy)static_rangea  
    Iterator that counts upward forever.

    .. highlight:: python
    .. code-block:: python

        @triton.jit
        def kernel(...):
            for i in tl.static_range(10):
                ...
    :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of
        :code:`triton.jit` functions. In addition, it also guides the compiler to unroll the loop aggressively.
    :param arg1: the start value.
    :param arg2: the end value.
    :param step: the step value.
    Nc                $   t        |t              s
J | d       |t        d      | _        n!t        |t              s
J | d       || _        |t        d      | _        || _        y t        |t              s
J | d       || _        || _        y )Nz7 used as tl.static_range start value is not a constexprr   z6 used as tl.static_range step value is not a constexprr   z5 used as tl.static_range end value is not a constexpr)rW   rU   rX  rV  r;  )r0   arg1arg2rX  s       r"   rZ   zstatic_range.__init__F  s    $	*ltf4k,ll*<!!DIdI.o4&8n0oo.DI<"1DJDHdI.n4&8m0nn.DJDHr$   c                    t        d      Nz8static_range can only be used in @triton.jit'd functionsr]  r^   s    r"   r   zstatic_range.__iter__U      UVVr$   c                    t        d      r>  r]  r^   s    r"   __next__zstatic_range.__next__X  r?  r$   r  r9   rR   rS   r8   rZ   r   rA  r/   r$   r"   r9  r9  4  s    "WWr$   r9  c                  (    e Zd ZdZ	 	 ddZd Zd Zy)r  a  
    Iterator that counts upward forever.

    .. highlight:: python
    .. code-block:: python

        @triton.jit
        def kernel(...):
            for i in tl.range(10, num_stages=3):
                ...
    :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of
        :code:`triton.jit` functions. In addition, it allows user to pass extra attributes to the compiler.
    :param arg1: the start value.
    :param arg2: the end value.
    :param step: the step value.
    :param num_stages: pipeline the loop into this many stages (so there are
        :code:`num_stages` iterations of the loop in flight at once).

        Note this is subtly different than passing :code:`num_stages` as a
        kernel argument.  The kernel argument only pipelines loads that feed
        into :code:`dot` operations, while this attribute tries to pipeline most
        (though not all) loads in this loop.
    :param loop_unroll_factor: Tells the Triton IR level loop unroller how many
        times to unroll a for loop that this range is used with. Less than 2 for
        this value implies no unrolling.
    :param disallow_acc_multi_buffer: If true, prevent the accumulator of the dot
        operation in the loop to be multi-buffered, if applicable.
    :param flatten: automatically flatten the loop nest starting at this loop to
        create a single flattened loop. The compiler will try to pipeline the
        flattened loop which can avoid stage stalling.
    Nc                    |t        d      | _        n|| _        |t        d      | _        || _        n|| _        || _        || _        || _        || _        || _        y )Nr   r   )rU   rX  rV  r;  
num_stagesloop_unroll_factordisallow_acc_multi_bufferflatten)r0   r;  r<  rX  rE  rF  rG  rH  s           r"   rZ   zrange.__init__}  s\    <!!DIDI<"1DJDHDJDH$"4)B&r$   c                    t        d      Nz4tl.range can only be used in @triton.jit'd functionsr]  r^   s    r"   r   zrange.__iter__      QRRr$   c                    t        d      rJ  r]  r^   s    r"   rA  zrange.__next__  rK  r$   )NNNNFFrB  r/   r$   r"   r  r  \  s#    @ X\:?"SSr$   r  c                   t        |      dk(  rt        d      t        t        |j                               d         }t        |      |k7  rt        dt        |       d|       g }	g }
|D ]t  }t	        |t
              r7|	j                  |j                         |
j                  |j                         J|	j                  t        |             |
j                  |       v t        |	      }	|	|vrt        d|j                          d|	       ||	   d   }||	   d   }|rt        ||      }t         | ||||
|j                  |      |      |      S )a  
        Dispatch a function to a library
        :param func: the function to dispatch
        :param lib_name: the name of the library
        :param lib_path: the path of the library
        :param args: the arguments of the function
        :param arg_type_symbol_dict: the type of the arguments
        :param ret_shape: the shape of the return value
        :param _builder: the builder
        :return: the return value of the function
    r   zarg_type_symbol_dict is emptyz+length of input args does not match.Expect z, got z,input arg type does not match.Expect one of r   )r6   r   r:   r7   rW   r?   r  r   r  rY   r  r  r  )r  lib_namelib_pathr   arg_type_symbol_dict	ret_shaper.  r   num_args	arg_typesarg_listr  symbolret_types                 r"   dispatchrW    sp     A%8994,1134Q78H
4yH ##&t9+VH:? @ 	@ IH !c6"SYY'OOCJJ'T#Y'OOC ! i I,, **>*C*C*E)FfYKY Z 	Z &i03'	215!(I6Hd8XvxPXAY[bcemnnr$   c           
        |j                         }d}d}g }	t        j                  t        |            D ]\  }
t	        j
                  ||
   |      ||
<   |	j                  ||
   j                         ||
   j                  j                         s[d}^ t        |	      dkD  rt        |	      }	d}|	|v rd}|d   }|D ]  }t	        j                  ||||      \  }}  t        j                  t        |            D ]$  }
t	        j                  ||
   |||      \  ||
<   }& |s|j                  }|j                  }t        || ||||||      S )a  
        Dispatch an elementwise function to a library
        :param lib_name: the name of the library
        :param lib_path: the path of the library
        :param args: the arguments of the function
        :param arg_type_symbol_dict: the type of the arguments
        :param is_pure: whether the function is pure
        :param _builder: the builder
        :return: the return value of the function
    TNFr   )r#  )copyrR  r  r6   r   rO   r  r   rY   ra  r  r*  r  create_extern_elementwiserW  )rN  rO  r   rP  r.  r   r2  
all_scalarrQ  rS  r  r#  r4  r5  r  r  s                   r"   extern_elementwiser\    sy    IIKMJII^^C./ #--mA.>Iaq)//0  ))+J	
 9~)$	,,$%a(! 	hD'DDT=ZbVf hA}	h M 23 	kA"*"G"GVWHXZgiqYi#kM!a	k %++I--DD(Hm=QS\^egoppr$   c                0    t        j                  | ||      S )z
        Convert both operands to a single common type
        :param lhs: the left operand
        :param rhs: the right operand
        :param builder: the builder
    )r   r*  )rv  ry  r  s      r"   binary_op_type_legalizationr^    s     00c7CCr$   c                    t        |       S )z#A decorator for external functions.)r(   rM   s    r"   externr`    s    2;r$   )r!   r   r   r   r  r.   )r   r   r  r   r   r   )rC  r?   )FNr  r  r  r  )NNr/   r+   r+   r+   FN)r   r  )Nr/   r+   r+   N)r  r?   )
r  r?   r  r)  r  r)  r  zList[constexpr]r   r  r  )r2   r   r  r   r   Callable[[T], T]r  )NNNN)TN)r  rU   )r  rU   )
r2   r   r  r   r  r   r  r   r   ra  )FNN)r2   r   r   ra  )r  r   r;  r   )r+   N)r,  r   r-  r   r   r   r   zUnion[dtype, Sequence[dtype]]r.  r   r/  r   )rN  r   rO  r   r   r:   rP  dictrQ  r  r.  r   )
rN  r   rO  r   r   r:   rP  rb  r.  r   )
__future__r   warningsr   
contextlibr   enumr   	functoolsr   r   rV  r	   r
   r   r   r   r   r   rR  runtime.jitr   r4   r/  _C.libtritonr   r+   r   _utilsr   r   r   r'   PROPAGATE_NANPropagateNanr(   rD   rJ   r>   rO   rQ   rU   r  r   r   r   r   r   r)  r  r  r  r  r  r   r   r   r   r   r   r   r   r   r   float8e5float8e5b16
float8e4nv
float8e4b8float8e4b15float16r  rt  float64pi32_tr  r?   r  rS  r  r  r  r  r2  re   r5  r8  r:  r8   r  r>  r@  rD  rf  rk  rm  rM  r  rS  r  rp  rr  r^  rU  r`  rk  ru  r  r  r  r  r{  r  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  NONEr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r!  r7  r9  r  rW  r\  r^  r`  r/   r$   r"   <module>rv     s	   "  %  $  L L L    	   ACL% )X,.
 	+ 	+	 	C) C)L l6	" 	"" "*J9I J9` !5 !H'| '' 'T+ +B! ! V}V}V}gggg	x	x	x>J9
9
J
-=
-
-	e	U8ZZ ZzA#J A#H! !#) #2@W: @WFk8 k27&J 7.0 	/ 	/$ 	1 	1  	1 	1& '>%> ?
3
 	8 	8, 		A 		A )- A 	 A& )- 3 	 34 #' 3 	 3, 	= 	=  	) 	).    	 B !% N 	 N( ',t A 	 A&.  	 8 G 	 G< 	]amt!h 	!hH 	KO[`ls ) 	 )P 	rt"&8- 	8-v 	9=PBfP 	P 	1 	1 	9 	9 .k 	 .kb 	` 	` 5 	 5 	 6X
6X6X 6X !	6X %6X 	6X|> &5H 6 	 H JJ   	 J EI  	 I EI  	 I EI  	 I M"I # 	 I L!H " 	 H M"I # 	 I 	5 	56 	; 	; 	; 	; 	; 	; 	-9->-> ; 	;* 	-9->-> ; 	;* 	2>2C2Cd @ 	@D [_+/%(4D@ ( ( - 	 -` 	 	 	 	,  !Z 	 !ZH 
9 	 
9 7 	 7( 	, 	, 	/ 	/ 	2 	2 	1 	1$ 	I 	I 	%(TE\` 	 		 	
	 	
	 	$)D -B 	-B` 	U 	U, 	>BQ$(Q03Q 	QN%W %WP6S 6S~ &*(o(oV 	 $'q 	'qTDr$   