
    VhLs                     (   d dl Z d dlZd dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= de>dee?e@df      fdZA	 d dlBZB eAeBj                        ZDdZEeDeD\  ZFZGnd ZFd ZG ej                  eJ      ZKej                  j                  ZM e%de7ej                  j                  eEreFdk\  reGdk\  rdnd      ZP e%de:d      ZQ e j                  d      d         ZS e$ej                  d!      ZU e$ej                  d"eMj                  j                  #      ZX e$ej                  d$      ZZ e$ej                  d%d&      Z\d' Z]d( Z^d) Z_dddd*d+Z` e$e`d      Za e!eMj                  d,      dd-d.       Zb e!eMj                  d,      dd-d/       Zc e!eMj                  d,      dddd0d1       Zd e!eMj                  d,      ddd2d3       Ze e j                  d      d4ee@   deffd5       Zgd6 Zhd7 Zi	 	 d<d8ee@   fd9Zjd: Zkd; Zly# eH$ r dZDdZEd ZFd ZGY w xY w)=    N)Optional)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)V   )configir)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKGemmTemplate)PythonWrapperCodegen)FlexibleLayout	is_triton)register_lowering)autotune_select_algorithmExternKernelChoiceTritonTemplate)	get_gpu_shared_memoryget_tma_workspace_arguse_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_max_autotuneuse_triton_templateuse_triton_tma_template   )_is_static_problemaddmm_epilogueextra_mm_configsint8_mm_configsmm_args
mm_configsmm_grid
mm_optionspersistent_mm_configspersistent_mm_gridpersistent_mm_optionsshould_fallback_to_atentriton_configversion_stringreturn.c                 x    d}t        j                  ||       }|r t        d |j                         D              S y )Nz(\d+)\.(\d+)?c              3   2   K   | ]  }t        |        y wN)int).0groups     I/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/kernel/mm.py	<genexpr>z parse_version.<locals>.<genexpr>?   s     <ESZ<s   )rematchtuplegroups)r0   patternr;   s      r8   parse_versionr?   :   s4    GHHWn-E<U\\^<<<    TFmm   aX	  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and M >= BLOCK_M:
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and N >= BLOCK_N:
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
a2	  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcemm_persistent_tmaal  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        a = tl._experimental_descriptor_load(
            a_desc_ptr,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
c                     t        |       S r4   )r   )fns    r8   lazy_register_extern_choicerI   M  s    b!!r@   z
at::mm_outzat::addmm_out)op_overloadzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantc                 b    | j                         t        j                  t        j                  fv S r4   )	get_dtypetorchint8uint8)mats    r8   _is_int8_matrR   a  s     ==?uzz5;;777r@   c                     | |z  dkD  S )Ni     )mnks      r8   _is_large_block_for_cpurX   e  s    q55=r@   c                 "    | dk(  r	dt         dS i S )Ncpug      ?)scaleexclude)rX   )devices    r8   mm_config_kwargsr^   j  s     .
 	
 Ir@   outalphabetac                    | j                  d      dk(  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r"   r_   )stridesizerN   addmm)inpmat1mat2r`   ra   rb   s         r8   
bias_addmmrj   s  sY     zz!}SXXa[A-{{3q643e$OO;;sD$Cu4HHr@   )type_promotion_kindlayoutc                   t        | ||      \  }}}}} }d}t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j	                         |j	                         |       |}t               s,t        |j                  |j                  |j                        }t               rt        j                  | |f|      gng }t        |      \  }	}
|
rt        |      rt        |||fi t!        t#        j$                  |             D ]*  }t'        j(                  |f| |f|d	t+        |||||       , t-        | |      r}t/        |||fi t!        t#        j$                  |             D ]P  }t1        j(                  |f| |f|t3        d
| j5                               dt+        |||||      t7        | |       R |
r't9        ||||      rt;        j<                  ||| |g       |
r't?        ||||      rtA        jB                  ||| |g       tE        || |      rtG        jH                  ||| |g       | |g}|
r1t        |      r%tJ        jL                  jN                  jQ                  |      rtS        |       rg }t               r|jU                  d       tW        |      }tY        |||fi t!        t#        j$                  |             D ]*  }t'        j(                  |f| |f|d	t+        |||||       , t[        | |||||||t]               d d|      }tJ        jL                  jN                  j_                  |      s*|#tW        |      dkD  r|D cg c]	  }||v s| }}n|d | }t`        jb                  D ].  }|jU                  te        |      j                  | |f|             0 tg        |      r&t        j                  | |f|      ji                         S tk        ||| |g|      S c c}w )Nrl   rA   aten_mm_infozaten.mm__r"   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr]   dtypere   input_nodesrm   r   num_tma_descriptorsr]   rt   rm   workspace_arg	extern_mm
   )top_kalways_includedr   )6r'   r   loginforM   r   r   r]   rr   re   r   aten_mmbindr#   r    r(   r^   r   get_device_typemm_templatemaybe_append_choicer*   r!   r+   persistent_tma_mm_templater   
get_devicer-   r   r   add_cutlass_gemm_choicesr   r   add_ck_gemm_choicesr   r
   add_choicesrN   	_inductorr   run_autoheuristicr   appendlenr%   mm_autoheuristicr	   collect_autoheuristicinductor_configexternal_matmulrI   r.   output_noder   )rh   ri   rm   rU   rV   rW   rC   aten_layoutchoicesstatic_shape
is_nonzeror   rt   r|    num_choices_before_extra_configs
ah_choiceschoices                    r8   tuned_mmr     s)   ")$V"DAq!VT4D ^xs!A3as3494HHY			 K$==6;;
 6K5LtTlK	01RT   2&9L*)&1 AqW,<R=O=OPT=U,VW 	F++!4L VQ1f5		 #4./1a+B,>,>t,DE  +>>
!%t!"7,-#0#	
 !Aq&9
 ,D$7
 *61a;66wtU*61a;**7FT4LIVT40##4L	
 ,K'OO""44T:dO """;/+.w<(&q!
'(:(:4(@A
 	F ++!4L VQ1f5		 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD,, R215::D$<PQR w'||T4L+6BBDD$T7T4L&II Qs   	O6(O6c                   t        | ||t        j                        \  }}}}} }t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       t        |      \  }}|xr |xr t        ||||      }t               rt        j                  | |f|      gng }	|rt        j                  |	|| |gdd       |rdt        |d	      rWt        |||fi t!        t#        j$                  |             D ]*  }
t'        j(                  |	f| |f|d
t+        |
||||       , t-        |	      r&t        j                  | |f|      j/                         S t1        d|	| |g|      S )N)rm   	out_dtypero   zaten._int_mm_rp   r"   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sTfuseablenon_fuseable)enable_int32rs   int_mm)r'   rN   int32r   r}   r~   rM   r#   r   r   aten__int_mmr   r   r   r    r&   r^   r   r   r   r   r*   r.   r   r   )rh   ri   rm   rU   rV   rW   r   r   use_cutlassr   r   s              r8   tuned_int_mmr     s   ")d6U[[#Aq!VT4
 ^}QCq1QC89Q>9HH^			  2&9L*W:W2FvqRSUV2WK 6K5L		D$<	01RT  66VdD\Dt	
 )&tD%q!
'(:(:4(@A
 	F ++!4L VQ1f5		 w'  $v6BBDD$XwtfMMr@   )ra   rb   rm   c                   d}t        ||| |      \  }}}	}}}}
t        |      \  }}t        d   d| d| d|	 xx   dz  cc<   t        j	                  d|||	|j                         |j                         |       |r
t               swdd	lm}m	} t        ||      r) ||j                  |j                  |j                  
      }t               rt        j!                  | ||f|||      gng }t#        d|| ||g|      S t               rt        j!                  |
||f|||      gng }t               ry|
j%                         d   dk(  rc|
j'                         j(                  dk(  rFt*        j,                  j.                  r,|j1                  dt2        j!                  |
||f|||             |r"t5        |      rt7        |||	fi t9        t;        j<                  |            D ]E  }t?        j@                  |f|
||f|dtC        ||||	|      dtE        |j                  ||      d G tG        ||      rtI        |||	fi t9        t;        j<                  |            D ]k  }tK        j@                  |f|
||f|tM        d|j'                               dtC        ||||	|      tO        ||      dtE        |j                  ||      d m |r_|r]tQ        ||||	      rOtS        jT                  |
jV                  jX                  d         dk7  r t[        j\                  |||||
g||g d       |r.t_        ||||	      r ta        jb                  |||||
g||g d       te        |||      rtg        jh                  |||
||g||d       tk        |      r|jm                  t        j!                  |
||f||||             |
j%                         d   dk(  rc|
j'                         j(                  dk(  rFt*        j,                  j.                  r,|j1                  dt2        j!                  |
||f|||             t#        d||
||g|      S )N)rb   ra   rl   ro   zaten.addmm_rp   r"   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   rq   )ra   rb   rf   cudars   )prefix_argsepilogue_fnr   ru   rw   )r   r   r"   )ra   rb   input_reorderT)ra   rb   has_bias)7r'   r#   r   r}   r~   rM   r   torch._inductor.irr   r   
isinstancer]   rr   re   r   
aten_addmmr   r   
get_strider   typer   tritonautotune_cublasLtinsertaten_bias_addmmr    r(   r^   r   r   r   r   r*   r$   r!   r+   r   r   r-   r   r   statically_known_int_or_nonerm   rd   r   r   r   r   r   r   r
   r   r.   r   )rg   rh   ri   ra   rb   rm   ordered_kwargs_for_cpp_kernelrU   rV   rW   inp_expandedr   r   r   r   r   r   s                    r8   tuned_addmmr   (  s   $5!07dCPV0W-Aq!VT41&9L* ^{1#Qqc1#671<7HH\			  0 2 	Cfk*#}}FLLv{{F %& $%	     	 )'Ct;LfUU !" OOtT*	  	
   	##%a(A-##%**f4""44 	  tT*F%d ! 	
 )&1 AqW,<R=O=OPT=U,VW 	F++)46 VQ1f5	
 *6<<E	 #4./1a+B,>,>t,DE  +>>!-tT :!"7,-#0#	 !Aq&9 ,D$7 !" .v||UD I" 
';FAq!'L
 !==##**2. 
 "::t\*' *61a;**4&#	
 VT40##4&	
 w'OOtT*-  	
 ##%a(A-'')..&8&&88 NN$$!4.e$ %  %<t4f r@   )r   rm   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r+t        ||||      rt        j                   ||| ||gdd	       t#        d
|| ||g|      S )Nr   )realize_inputsr   )r   r"   z,out_dtype is ignored if layout is specified.)r   Tr   sparse_semi_structured_mm) torch._inductor.select_algorithmr   get_sizer   graphsizevarsguard_equalsr   r   r   rM   r   aten__sparse_semi_structured_mmr   r   r   r   r   )rh   	mat1_metari   r   rm   r   m1k1m2rp   k2rV   rU   rW   r   r   s                   r8   tuned_sparse_semi_structured_mmr     sj    @*4DAD)T]]_FB EBMMOEB	%%b"-A	%%a"fb1A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	1uz*61a;66VdD)4tRV	
 %#WtY.Ev r@   indexc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )rN   r   get_device_propertiesmajor)r   propss     r8   _is_sm7x_or_older_gpur     s)    JJ,,UZa8E;;!r@   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wr4   )r   r5   )r6   dims     r8   r9   zdims_are_int.<locals>.<genexpr>  s     4z#s#4s   )all)dimss    r8   dims_are_intr      s    4t444r@   c                    t        ||| ||      \  } }}t        | ||g      sy |j                  t        j                  k7  ry t        j
                  j                         dk\  rt               dk7  ry | dk(  r|dz  dk7  s|dz  dk7  ry | dk  r|dk\  r|dk\  rt        dddd	d
      S | dkD  r| dk  r|dk\  r|dk\  rt        dddd	d
      S | dkD  r| dk  r|dk\  r|dk\  rt        dddd	d
      S y )N)   r   i  r"      r   i   @            )BLOCK_MBLOCK_NBLOCK_K
num_stages	num_warps    )	get_size_hintsr   rr   rN   float16r   get_device_capabilityr   r/   )rU   rV   rW   r   rh   ri   
mat2_dtyperm   s           r8   try_heuristicr     s#   T4Aq1GAq!Aq	"zzU]]" JJ,,.&8		 F	*Av1r6Q;!b&A+Bw19d
 	
 
RAGT	a4i
 	
 
RAGT	a4i
 	
 r@   r{   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )NrU   rW   rV   
mat1_dtypeT)is_categoricalr   rh   ri   mat1_iscontigmat2_iscontigrA   )r   add_featurerm   rr   r   is_contiguousr   )	rU   rW   rV   rh   ri   mat1_stridemat2_stridecontextrC   s	           r8   get_contextz%mm_autoheuristic.<locals>.get_contextE  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>r@   c                       y r4   rT   rT   r@   r8   fallbackz"mm_autoheuristic.<locals>.fallbackY  s    r@   )r   r   rt   r   rC   augment_contextprecondition)r|   )r   r   get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rh   ri   rU   rV   rW   r   rC   rt   opsr   r{   r|   r   r   r   r   r   autoheuristics         `           r8   r   r   2  s     T4Aq1GAq!Aq	"5dDAK( !Q4{KHG0!M 55? 6 
 	
 **,,r@   c                    t        |t              rt        |t              s^t        j                  j                  j                  | j                         t        j                  j                  j                        \  }}t        |t              rt        |t              s^t        j                  j                  j                  |j                         t        j                  j                  j                        \  }}|||fS )Nr   )r   r5   r   r   r   
size_hintsr   rN   r   r   unbacked_symint_fallback)rh   ri   rU   rV   rW   s        r8   r   r   p  s    aZ3%7!!,,MMO__++DD - 
A
 aZ3%7!!,,MMO__++DD - 
A a7Nr@   c                 d   | j                   j                  }|j                   j                  }||g}g }|D ]p  }t        |t              sMt        j
                  j                  j                  |t        j                  j                  j                        }|j                  |       r |d   |d   fS )Nr   r   r"   )rm   rd   r   r5   r   r   r   r   rN   r   r   r   r   )rh   ri   r   r   stridesstrides_hintsrd   s          r8   r   r     s    ++$$K++$$KK(GM %&#&WW%%00//HH 1 F 	V$% ]1---r@   )NN)m	functoolsloggingr:   typingr   rN   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r   r	   )torch._inductor.codegen.cpp_gemm_templater
   torch._inductor.virtualizedr    r   r   r   codegen.cuda.gemm_templater   r   'codegen.rocm.ck_universal_gemm_templater   codegen.wrapperr   r   r   loweringr   select_algorithmr   r   r   utilsr   r   r   r   r   r   r   r    r!   	mm_commonr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   strr<   r5   r?   r   __version__triton_version
has_tritontriton_majortriton_minorImportError	getLogger__name__r}   r   atenversionhipr   r   	lru_cacherI   rA   r   rf   defaultr   _int_mmr   _sparse_semi_structured_mmr   rR   rX   r^   rj   r   r   r   r   r   boolr   r   r   r   r   r   rT   r@   r8   <module>r%     s     	   ( T  F ) , U D 2 * ( 

 
 
   "# (5c?*C "6#5#56NJ!%3"l g!yy~~		F MM%<1,1BE@	L@UL\ ,		`d P T" " UXX|
4	KKdjj.@.@
 "%--1BC"4	$$$# 8
 (,11 I %Z6 4775#' uJ 6uJp 4<<T:'+ *N ;*NZ 4::48*+!D a 9aH 422M(,T) N)X T# 4  
5+r  ;- C=;-|.c  NJLL	s   4$J   JJ