
    Vh                        d dl Z d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ ddl	mZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZ  e j:                  e      Z eej@                  dd      Z! eejD                  jF                  jH                  ddejJ                        Z&ejD                  jF                  Z#ejD                  jN                  Z'ejD                  jP                  Z(ddZ)ddZ*y)    N)Any)mm_args   )configlowering)CppGemmTemplateCppWoqInt4GemmTemplate)create_epilogue_with_attr)expandregister_lowering)WeightInt4PackMatmul)autotune_select_algorithmExternKernelChoicerealize_inputs)use_aten_gemm_kernelsuse_cpp_gemm_templateuse_max_autotune)Vzat::_weight_int8pack_mmF)has_out_variantz*at::native::_weight_int4pack_mm_cpu_tensor)r   kernel_creatorc                  Z   t        j                  t        j                  t        j
                  t        j                  g       t        j                  t        j                         t        j                  t        j
                         t        j                  t        j                         y N)r   add_needs_realized_inputs	quantized
max_pool2d
_quantized$wrapped_fbgemm_pack_gemm_matrix_fp16!wrapped_fbgemm_linear_fp16_weightmake_fallback     S/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/quantized_lowerings.pyregister_quantized_opsr#   '   sn    &&  ;;88	
 9//0:JJK:GGHr!   c                      t        t        j                  d       d ddt        j                  dt        j                  dt        j                  dt
        dt
        f
d       } t        t        j                  d       d ddt        j                  dt        j                  d	t        d
t        j                  dt
        dt
        fd       }t        j                  t        j                         t        j                  t        j                         y )N)type_promotion_kind)layoutinputweightscaler&   returnc                   t        | |d      \  }}}}}|j                         t        j                  t        j                  t        j
                  fv r!|j                         t        j                  k(  sJ }t               rt        j                  ||f|      gng }dt        j                  dt        ffd}	t        |||d      rt        j                  ||||gd|	       t        |      dk(  rVt         j"                  rFt               s<t$        j'                  d	       t        j                  ||f|      j)                         S t+        d
|||g|      S )NT)r&   mat2_transposedbufr*   c           
      Z    t        | dt        t        j                                    S )Nmul)other)r
   r   r   size)r-   r&   r)   s    r"   _mul_epiloguez?register_woq_mm_ops.<locals>.int8pack_mm.<locals>._mul_epilogueO   s'    ,U.v{{1K"L r!   )r,   )trans_wepilogue_creatorr   3No choices for GEMM, using ATen backend as fallback_weight_int8pack_mm)r   	get_dtypetorchbfloat16float16floatint8r   aten__weight_int8pack_mmbindTensorr   r   r   add_choicesleninductor_configautotune_fallback_to_atenlogwarningoutput_noder   )
r'   r(   r)   r&   _mat1mat2aten_layoutchoicesr2   s
     ``      r"   int8pack_mmz(register_woq_mm_ops.<locals>.int8pack_mm5   sS    '.6&$'
#1at NN LL EJJ.	
/ 
 %& &**D$+>LM 		u|| 	 	
 !dD$O''tU#!. LA99)+KKMN+00tU#[km )!7T4,?
 	
r!   
qGroupSizeqScaleAndZerosc                   t        | ||dd      \  }}}}}}|j                         t        j                  t        j                  t        j
                  fv r!|j                         t        j                  k(  sJ t        j                  j                  t        j                  |t        j                        d       }|}	t               rt        j                  ||||f|	      gng }
t               rMt!        |	||dd|      r<|j#                         j%                         rt&        |   j)                  |
|	||||g       t+        |
      dk(  rWt,        j.                  rGt               s=t0        j3                  d       t        j                  ||||f|	      j5                         S dt        j6                  j8                  j:                  d	t        j<                  fd
}|d d}t?        d|
||||g|	|      S )NT)r&   use_4x2_dimr,   )dtype)name)r,   is_woq_int4q_group_sizer   r5   xr*   c                     | j                         j                         sJ | j                         }| j                         }t	        j
                  dd|t        j                  |      S )Nr      )rQ   device)
get_layoutis_contiguousget_size
get_devicer8   randintuint8)rU   shaperX   s      r"   get_example_weightzHregister_woq_mm_ops.<locals>.int4pack_mm_cpu.<locals>.get_example_weight   sK    <<>//111JJLE\\^F==Cekk&QQr!   c                 X    t         j                  j                  | j                            S r   )r   graph	constantsget_name)rU   s    r"   <lambda>z>register_woq_mm_ops.<locals>.int4pack_mm_cpu.<locals>.<lambda>   s    **1::<8 r!   )r      _weight_int4pack_mm_for_cpu)input_gen_fns) r   r7   r8   r9   r:   r;   r^   r   rb   add_tensor_constanttensorint64r   aten__weight_int4pack_mm_cpur>   r   r   rY   rZ   r	   r@   rA   rB   rC   rD   rE   rF   	_inductorirIRNoder?   r   )r'   r(   rM   rN   r&   rG   rH   rI   
group_sizerJ   rK   r`   rh   s                r"   int4pack_mm_cpuz,register_woq_mm_ops.<locals>.int4pack_mm_cpuk   s    '.6&dD'
#1at NN LL EKK/	
0 WW00LL5;;7d 1 

  %&	 -114^<k  	 % $ ' !//1":.::tZ8 LA99)+KKMN/44tZ8+km	R%//"4"4";"; 	R 	R "8

 ))4^4'
 	
r!   )r   atenr6   r8   r?   r   rg   intr   r   _dyn_quant_matmul_4bit_dyn_quant_pack_4bit_weight)rL   rq   s     r"   register_woq_mm_opsrv   4   s    t//TJ 3
||3
3
 ||3

 3
 
3
 K3
j t77TR M
||M
M
 M
 	M
 M
 
M
 SM
^ 46674;;<r!   )r*   N)+loggingtypingr   r8    torch._inductor.kernel.mm_commonr    r   rB   r   codegen.cpp_gemm_templater   r	   codegen.cpp_utilsr
   r   r   	mkldnn_irr   select_algorithmr   r   r   utilsr   r   r   virtualizedr   	getLogger__name__rD   r6   r=   opsr   int4mm_packed_weight_cpucreaterl   r   rr   r#   rv   r    r!   r"   <module>r      s       4 1 N 8 / + 
 R Q  g!-	8%   2	II000'..	   II	YY!!
yy~~
IH=r!   