
    VhM                     V   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZmZmZ ddlmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/  ej`                  e1      Z2ejf                  jh                  Z4dZ5dZ6dZ7 e de,e7e5z   e6z         Z8 e de+d      Z9 e de+d      Z: eejv                  de4jv                  jx                        Z=dee>   dee>   de?fdZ@dededdfd ZAd!ej                  j                  j                  d"ej                  j                  j                  d#ej                  j                  j                  d$ed%ed&ed'e?deEeFef   fd(ZGd!ej                  j                  j                  d"ej                  j                  j                  d#ej                  j                  j                  d$ed%ed&ed'e?deEeFef   fd)ZH ee4jv                  j                  e       d*ej                  j                  j                  d+e?de?fd,ZJ ee4jv                  j                  d-      	 	 	 	 	 d3deded%ed&ed.ee   d/ee   d0eej                     d'e?d$ee   defd1       ZLe j                  de?fd2       ZNy)4    N)Sequence)AnyOptional)counters)CKGemmTemplate)has_triton_tma_device   )triton)_IntLikeChoiceCallerLayout
StorageBox	TensorBox)add_layout_constraintconstrain_to_fx_stridesregister_lowering)autotune_select_algorithmExternKernelChoicerealize_inputsTritonTemplate)get_num_smsget_tma_workspace_argTMA_DESCRIPTOR_SIZEuse_aten_gemm_kernelsuse_ck_gemm_templateuse_triton_template   )_is_static_problemmm_argsmm_gridpersistent_mm_gridscaled_mm_configsscaled_persistent_mm_configsshould_fallback_to_atena  
@triton.jit
def load_scales(a_scale_ptr, b_scale_ptr, SCALING_ROWWISE: tl.constexpr):
    if SCALING_ROWWISE:
        # For row-wise scaling, we'll return the pointers
        return a_scale_ptr, b_scale_ptr
    else:
        # For per-tensor scaling, we'll load the scalar values
        a_scale = tl.load(a_scale_ptr)
        b_scale = tl.load(b_scale_ptr)
        return a_scale, b_scale
"""

apply_scaling = r"""
@triton.jit
def apply_scaling(
    accumulator,
    a_scale,
    b_scale,
    SCALING_ROWWISE: tl.constexpr,
    offs_cm,
    offs_cn,
    M,
    N,
    stride_a_scale_m,
    stride_b_scale_n,
):
    if SCALING_ROWWISE:
        # For row-wise scaling, we need to load the scales for each row/column
        a_scales = tl.load(
            a_scale + (offs_cm * stride_a_scale_m),
            mask=offs_cm < M,
            other=0.0,
        )
        b_scales = tl.load(
            b_scale + (offs_cn * stride_b_scale_n),
            mask=offs_cn < N,
            other=0.0,
        )
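        # Outer-product broadcast: a (BLOCK_M,) column of row scales times a
        # (BLOCK_N,) row of column scales gives a (BLOCK_M, BLOCK_N) scale tile,
        # e.g. a_scales=[2, 4] and b_scales=[1, 0.5] -> [[2, 1], [4, 2]].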
        acc_scale = a_scales[:, None] * b_scales[None, :]
    else:
        # For per-tensor scaling, we can directly use the loaded scalar values
        acc_scale = a_scale * b_scale

    return accumulator * acc_scale
"""

device_tma = r"""
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

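    # Row-wise scaling walks one scale per output row/column (stride 1); with
    # per-tensor scaling the scales are scalars, so the strides collapse to 0.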
    if SCALING_ROWWISE:
        stride_a_scale_m = 1
        stride_b_scale_n = 1
    else:
        stride_a_scale_m = 0
        stride_b_scale_n = 0

    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

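    # Each program claims a private 2 * TMA_SIZE byte slice of the workspace
    # buffer supplied by the caller: one device-side TMA descriptor for A and
    # one for B.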
    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K],
        global_size=[M, K],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_N, BLOCK_K],
        global_size=[N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)
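    # The fences publish the descriptor writes above, so the tile loads below
    # observe fully initialized tensormaps.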

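    # Persistent scheduling: NUM_SMS resident programs split num_tiles output
    # tiles, and the first (num_tiles % NUM_SMS) programs take one extra tile.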
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    pid_m = 0
    pid_n = 0
    offs_am = 0
    offs_bn = 0

    num_pid_in_group = GROUP_M * num_pid_n
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    a_scale, b_scale = load_scales(A_inverse_scale, B_inverse_scale, SCALING_ROWWISE)

    for _ in range(0, k_tiles * tiles_per_SM):
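        # A single flat loop covers both the K reduction and the walk over
        # output tiles: ki wraps back to 0 at each tile boundary below.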
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            group_id = tile_id // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (tile_id % group_size_m)
            pid_n = (tile_id % num_pid_in_group) // group_size_m

            offs_am = pid_m * BLOCK_M
            offs_bn = pid_n * BLOCK_N

        offs_k = ki * BLOCK_K

        a = tl._experimental_descriptor_load(
            a_desc_ptr, [offs_am, offs_k], [BLOCK_M, BLOCK_K], A.dtype.element_ty
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr, [offs_bn, offs_k], [BLOCK_N, BLOCK_K], B.dtype.element_ty
        )
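        # B tiles are loaded as [BLOCK_N, BLOCK_K], hence the dot with b.T.
        # USE_FAST_ACCUM feeds the running accumulator straight into the MMA,
        # which is faster but may accumulate at reduced precision for fp8.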
        if USE_FAST_ACCUM:
            accumulator = tl.dot(a, b.T, accumulator)
        else:
            accumulator += tl.dot(a, b.T)

        if ki == k_tiles - 1:
            # Compute this tile's output coordinates, then apply the inverse scales
            offs_cm = offs_am + tl.arange(0, BLOCK_M)
            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
            accumulator = apply_scaling(
                accumulator,
                a_scale,
                b_scale,
                SCALING_ROWWISE,
                offs_cm,
                offs_cn,
                M,
                N,
                stride_a_scale_m,
                stride_b_scale_n,
            )

            idx_m = offs_cm[:, None]
            idx_n = offs_cn[None, :]
            mask = (idx_m < M) & (idx_n < N)
            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
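            # Reset the accumulator for the next output tile this program owns.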
            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
"""

scaled_mm_device_tma_template = TritonTemplate(
    name="scaled_mm_device_tma",
    grid=persistent_mm_grid,
    source=device_tma + load_scales + apply_scaling,
)

scaled_mm_template = TritonTemplate(
    name="scaled_mm",
    grid=mm_grid,
    source=r"""
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
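    # Consecutive programs sweep a GROUP_M-tall column of output tiles before
    # moving to the next column, so the B blocks they share stay hot in L2.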

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
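    # multiple_of/max_contiguous are compiler hints: the index vectors are
    # aligned and contiguous, which lets Triton emit coalesced, vectorized loads.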
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
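        # EVEN_K means BLOCK_K divides K, so masking of the K tail can be
        # skipped; otherwise out-of-range lanes are zero-filled.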
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if USE_FAST_ACCUM:
            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
        else:
            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    if SCALING_ROWWISE:
        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
        acc *= inv_scale_row
    else:
        # for tensor-wise scaling, the scales are scalars
        inv_a_scale = tl.load(A_inverse_scale)
        inv_b_scale = tl.load(B_inverse_scale)
        inv_scale = inv_a_scale * inv_b_scale
        acc *= inv_scale

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
""",
)

scaled_mm_bias_template = TritonTemplate(
    name="scaled_mm_bias",
    grid=mm_grid,
    source=r"""
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale", "bias_ptr")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if USE_FAST_ACCUM:
            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
        else:
            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    if SCALING_ROWWISE:
        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
        acc *= inv_scale_row
    else:
        # for tensor-wise scaling, the scales are scalars
        inv_a_scale = tl.load(A_inverse_scale)
        inv_b_scale = tl.load(B_inverse_scale)
        inv_scale = inv_a_scale * inv_b_scale
        acc *= inv_scale

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    # bias
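    # One bias value per output column; the (BLOCK_N,) vector broadcasts
    # across the BLOCK_M rows of acc when added below.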
    bias = tl.load(bias_ptr + rn, mask=rn < N)
    acc += bias

    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
zat::_scaled_mm_out)op_overloadsize_asize_breturnc                 n    t        |       t        |      k(  ryt        |       dk  rt        |      dk  ryy)NTr   F)len)r,   r-   s     P/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/kernel/mm_scaled.pyare_compatible_scalesr2     s4    
6{c&k! 6{aCK1,    mat_amat_bc                     dt         t           dt        fd}dt         t           dt        fd}dt         t           dt        fd}t        j                   | j                               xs  | j                                fd       t        j                   |j                               xs  |j                               fd       y )	Nstrider.   c                     | d   dk(  S )Nr    r7   s    r1   is_row_majorz.check_supported_striding.<locals>.is_row_major      ayA~r3   c                     | d   dk(  S Nr   r   r9   r:   s    r1   is_col_majorz.check_supported_striding.<locals>.is_col_major  r<   r3   sizec                 8    t        | d   dk(  xs | d   dk(        S r>   )bool)r@   s    r1   has_zero_dimz.check_supported_striding.<locals>.has_zero_dim  s"    DGqL0DGqL11r3   c                  *    d j                          S )Nz$mat_a must be row_major, got stride 
get_stride)r4   s   r1   <lambda>z*check_supported_striding.<locals>.<lambda>      6u7G7G7I6JK r3   c                  *    d j                          S )Nz$mat_b must be col_major, got stride rE   )r5   s   r1   rG   z*check_supported_striding.<locals>.<lambda>  rH   r3   )r   r   rB   torch_checkrF   get_size)r4   r5   r;   r?   rC   s   ``   r1   check_supported_stridingrM     s    Xh/ D Xh/ D 28H- 2$ 2 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKr3   sym_msym_nsym_klayoutscale_ascale_buse_fast_accumc                    t        j                  || j                  d         | j                  d   k(  }|j                         |j                         }
}	t	        |	|
      s J dt        |	       dt        |
       d       t        d	d|d|| j                  | j                  t        |j                               dk(  t        t               d	| j                  S )
NBLOCK_KExpect scale_a and scale_b to be either both scalars (including single-element tensors) or 1-dimensional tensors with the same size. Got scale_a:  and scale_b: .   
tl.float32r	   )	GROUP_MEVEN_KACC_TYPEUSE_FAST_ACCUM
num_stages	num_warpsSCALING_ROWWISETMA_SIZENUM_SMSr9   )sympygcdkwargsrL   r2   r0   dictr`   ra   r   r   configrN   rO   rP   rQ   rR   rS   rT   even_k_symbolicr,   r-   s              r1   scaled_mm_options_device_tmarl     s     			%y12fmmI6NN  %%')9)9);FF 0 	EEH[MQ_`cdj`k_llm	o0  %$$""G,,./14$ -- r3   c                    t        j                  || j                  d         | j                  d   k(  }|j                         |j                         }
}	t	        |	|
      s J dt        |	       dt        |
       d       t        d	d|d|| j                  | j                  t        |j                               dk(  d| j                  S )
NrV   rW   rX   rY   rZ   r[   r	   )r\   r]   r^   r_   r`   ra   rb   r9   )	re   rf   rg   rL   r2   r0   rh   r`   ra   ri   s              r1   scaled_mm_optionsrn     s     			%y12fmmI6NN  %%')9)9);FF 0 	EEH[MQ_`cdj`k_llm	o0  
%$$""G,,./14
 --
 
r3   khas_biasc                 Z    t               xr t        j                  }| dk\  }|xr |xr | S )N    )r   triton_configenable_persistent_tma_matmul)ro   rp   	availablemin_ks       r1   use_persistent_tmarw     s2    %'VM,V,VI GE//x</r3   )type_promotion_kindbiasscale_result	out_dtypec	                 <   t        | |||      \  }	}
}}} }t        d   d|	 d|
 d| xx   dz  cc<   t        j                  d|	|
|| j	                         |j	                         |       t        | |       t        ||      \  }}|| |||f}t        }nt        |      }| ||||f}t        }t        j                  ||||      }g }t               r|j                  |       t        |      \  }}|rt        |d	      rt        ||d u      r^t!        |	|
|      D ]M  }t#        ||	|
|||||      }| |||f}t%        j&                  |f||t)        d
| j+                               d| O nbt-        |	|
|      D ]R  }|dk(  r|j.                  d   dk\  rt1               r|dk  r+t3        ||	|
|||||      } |j&                  |f||d| T |r%t5        ||	|
|      rt7        j8                  |||       t;        |      r|j=                         S t?        d|||      S )N)rQ   r{   aten_mm_infozaten._scaled_mm.default__r   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)r{   rT   T)enable_float8r	   )num_tma_descriptorsdevice)input_nodesrQ   workspace_arg   BLOCK_M@   rr   )r   rQ   r)   ) r   r   loginfo	get_dtyperM   r   scaled_mm_templatescaled_mm_bias_templateaten__fp8_mmbindr   appendr   r   rw   r#   rl   scaled_mm_device_tma_templatemaybe_append_choicer   
get_devicer"   rg   
using_b200rn   r   r   add_ck_gemm_choicesr$   output_noder   )r4   r5   rR   rS   ry   rz   r{   rT   rQ   mnro   r   triton_templateaten_choicechoicesr~   
is_nonzerorj   rg   s                       r1   tuned_scaled_mmr     s    %,uVy%!Aq!VUE ^7s!A3asCDIDHHi			 UE*%gw7GW |eWg6,d#eWgt<1##Vy $ K #%G{#&v.MAz)&EaT!126q!Q? 5Aq!VWg~  %eWg>-AA	 +!"7,-$//1#		 	  ,Aq!4 7v}}Y72= <AF*Aq!VWg~ 433 +! 	( *61a;**7FKHw'&&(($[';OOr3   c                      t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.F
   )rJ   cudais_availableget_device_propertiescurrent_devicemajor)device_propertiess    r1   r   r   R  sJ     ::""$

889R9R9TU""b((r3   )NNNFN)O	functoolsloggingcollections.abcr   typingr   r   re   rJ   torch._dynamo.utilsr   7torch._inductor.codegen.rocm.ck_universal_gemm_templater   torch.utils._tritonr   rj   r
   rs   irr   r   r   r   r   loweringr   r   r   select_algorithmr   r   r   r   utilsr   r   r   r   r   r   	mm_commonr   r   r    r!   r"   r#   r$   	getLogger__name__r   opsatenload_scalesapply_scaling
device_tmar   r   r   
_scaled_mmoutr   intrB   r2   rM   corenumbersIntegerrh   strrl   rn   defaultrw   dtyper   	lru_cacher   r9   r3   r1   <module>r      s     $     ( R 5 , F F X X     g!yy~~ Fs
l !/		#m3!  $		FJ ` )		JN b "	*8K8K
	(3- 	# 	4 	I i D .::%% ::%% ::%%	
     
#s(^D::%% ::%% ::%%	
     
#s(^@ doo--/F G0%**,,44 0 0 0 4??**E !%(,'+ #^P^P^P ^P 	^P
 9
^P 9%^P $^P ^P V^P ^P F^PB )D ) )r3   