
    Vh-                     l   d dl mZ d dlmZmZ d dlZd dlZd dlmZ d dl	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ 	 	 	 	 d4d
ddddddee   dee   dee   dededeee      dee d      ded   fdZ!	 	 	 d5d
ddddddee d      ded   defdZ"d Z# G d de      Z$ G d de      Z% G d de      Z& G d  d!e      Z' G d" d#e      Z( G d$ d%e      Z) G d& d'e      Z* G d( d)e      Z+ G d* d+e      Z, G d, d-e      Z- G d. d/e      Z. G d0 d1e      Z/ G d2 d3e      Z0y)6    )Sequence)AnyOptionalN)make_channels_last_strides_for
OrderedSet   )ExternKernelAllocFixedLayoutFlexibleLayoutget_device_typeir_node_to_tensor is_contiguous_storage_and_layoutLayoutmay_convert_to_optionalMultiOutputMultiOutputLayoutMutationOutput
NoneLayout	TensorBox)convert_shape_to_inductorpad_listlike)Vxr   weightbiaspaddingstridedilationgroups
transposedoutput_paddingquantize_argsotherc                 ~   d }d }|j                          |j                          ||j                          t        j                  j                  5  t	        |d      }t	        |d      }t        |j                               dz
  }dt        |      cxk  r|k  sJ  J dt        |      cxk  r|k  sJ  J dt        |      cxk  r|k  sJ  J t        ||      }t        ||      }t        ||      }|	t        dg|      }	n%dt        |	      cxk  r|k  sJ  J t        |	|      }	t        |t        t        j                  j                  j                  f      sJ |r( |||      }|j                         } |||||	|||      }nR|t	        |d      n|}t        j                  j                   j#                  ||||||||	|	      }|j                         }dgt%        t'        t)        dt        |      dz                     z   }t        |      g|z   }ddd       | j+                  |      }t-        d	 D               }|r!t/        |      rt1        j2                  |      }nt5        |      }t7        |      t7        |      k(  sJ t7        |      d
v sJ |g}|
X|
\  }}}}|j                          |j                          |j                          |j                          |||gz   |gz   ||gz   }n||gz  }|*| j+                  ||      }t        |t8              sJ ||gz  }t;        |j=                         |j?                         tA        |      tA        |            }||||g}|r|jC                  d|	       ||jE                  |       n|jC                  d|       |||||fS # 1 sw Y   xY w)a}  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU/XPU device since conv post-op fusion kernel is only
    supported on CPU/XPU right now.
    c                    t        |       t        |      k(  sJ d       t        |       }|dkD  sJ d       d}d}	g }
|
j                  | |          |
j                  ||	   |z         t        d|      D ]P  }||   dz
  ||dz
     z  dz   }| |   dz
  ||dz
     z  ||dz
     dz  z
  |z   ||dz
     z   }|
j                  |       R t        t	        t
        |
            S )NzExpect input dim == weight dim   zExpect input dim > 2r   r	   )lenappendrangelistmapint)output_sizeweight_sizer   r"   r   r   r    dim	BATCH_DIMWEIGHT_INPUT_CHANNELS_DIM
input_sizedkernelinput_size_ds                 I/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/mkldnn_ir.py_conv_input_sizez<_prepare_convolution_fusion_create.<locals>._conv_input_size5   s!    ;3{#33U5UU3+Qw...w	$%!
+i01+&?@6IJq# 	,A!!nq(HQUO;a?FQ!#va!e}41q5>A%' !Q'(  l+	, CZ())    c                 L   | j                         t              }|dkD  sJ d       |dkD  rVg }|j                  d   |z         |j                  d   |z         |j                  fdt	        d|      D               |S | j                  dd      j                         }|S )Nr'   zExpect weight dim > 2r	   r   c              3   (   K   | ]	  }|     y wN ).0r4   prepacked_weight_sizes     r7   	<genexpr>z[_prepare_convolution_fusion_create.<locals>._original_deconv_weight_size.<locals>.<genexpr>[   s     OA4Q7Os   )sizer(   r)   extendr*   	transpose)prepacked_weightr    r0   r/   r?   s       @r7   _original_deconv_weight_sizezH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeP   s     !1 5 5 7'(Qw///wA:K4Q7&@A4Q7&@AOq#OO  +44Q:??AKr9   NT)guard_shaper'   r   r	   c              3   <   K   | ]  }t        |t                y wr<   )
isinstancer-   )r>   is     r7   r@   z5_prepare_convolution_fusion_create.<locals>.<genexpr>   s     GAZ3/Gs   cpuxpu)#realizer   graph	fake_moder   r(   rA   r   rH   r-   sympycorenumbersIntegertorchopsatenconvolutionr+   reversedr*   require_stride_orderallr   r   contiguous_stridesr   r   r   r   get_device_or_error	get_dtyper   insertr)   ) clsr   r   r   r   r   r   r    r!   r"   r#   r$   r8   rE   x_fakeweight_fakedimsr/   r3   r.   	bias_fakeoutputreq_stride_orderdynamic_shapesoutput_strideinputsx_scalex_zero_pointw_scalew_zero_pointkernel_layoutconstant_argss                                    r7   "_prepare_convolution_fusion_createro      s   .*6  IIK
NN	
		 2F"1$7'DA6;;=!A%3w<'4'''''3x=(D(((((3v;&$&&&&&w-$/fd+!)1#t4Ns>*2d22222).$?N&3

(:(:(B(B"CDDD 7{FKKJ*K >B=M!$D9SW  YY^^//
F !++-K3huQFa/H&I!JJ 0125EEe2Fh 	  $45A G+GGGN:1=&99+F6{C1!88881///SF 7D4w7L11VH<?VV6(((0@A%+++5'		!+.!-0	M fh7MQ/dQ%=-1A5HHI2F 2Fs   G
N22N<
binary_sumc           
         |j                          |j                          ||j                          |j                         ^ }}|j                         \  }}	t        |      |	gz   }
t        t        t	        t        |j                                                 }| j                  ||      }t        |      t        |      k(  sJ t        |      dv sJ |g}|X|\  }}}}|j                          |j                          |j                          |j                          |||gz   |gz   ||gz   }n||gz  }||r| j                  ||      }||gz   }t        j                  |
      }t        |j                         |j                         |
|      }g }||j                  |       n|j                  d|       |||||fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    rJ   r   )rM   get_sizer+   rX   r*   r(   rY   r   r   r[   r   
get_devicer]   r)   r^   )r_   r   r   r   r#   r$   rp   m_ocr.   re   rh   ri   rj   rk   rl   rg   rm   rn   s                       r7   _prepare_linear_fusion_createrw      s    IIK
NNJJLEQ OOEArq'RD.KHU3qzz|+<%=>?  $45A1!88881///SF 7D4w7L11VH<?VV6(,,U4DEE5'!"55kBM			M  "MdQ%=-1A5HHr9   c                     t        | j                         | g       }t        | j                               | _        |g| _        |S )Ndevice)r   
get_layoutr   rs   layoutoutputs)packed	output_irs     r7   _create_output_noder   
  sD    
I
 &V->->-@AFM[FNr9   c                        e Zd Z	 d	 d fdZ fdZedddddddee   dee   d	ee   d
edeee	      fd       Z
 xZS )ConvolutionUnaryc                     t         |   |||d t        j                  j                  j
                  j                  d       y )N,aoti_torch_cpu_mkldnn__convolution_pointwiseop_overloadcpp_kernel_name)super__init__rT   rU   mkldnn_convolution_pointwisedefaultselfr|   rh   rn   	__class__s       r7   r   zConvolutionUnary.__init__  s?     			((??GGJ 	 	
r9   c                 F    |j                  d       t        | 	  |       y Nz.torch/csrc/inductor/aoti_torch/c/shim_mkldnn.hinclude_extra_headerr   codegenr   wrapperr   s     r7   r   zConvolutionUnary.codegen%      $$%UV r9   r   r   r   r   padding_stride_	dilation_r    scalarsc           
          t        | |||||||      \  }}}}}||t        |	      |
gz   }t        |||      }t        |      S )Nr|   rh   rn   )ro   r   r   r   )r_   r   r   r   r   r   r   r    attrr   	algorithmrh   rn   rm   ru   r~   s                   r7   createzConvolutionUnary.create)  sr    ( /FD(GY
	
 &#G,)
 

 " '

 #6**r9   r=   returnN__name__
__module____qualname__r   r   classmethodr+   r-   r   r   r   __classcell__r   s   @r7   r   r     s    
 	

 

!  + +  + 	 +
 s) + c + 9 +  + $s)$ +  +r9   r   c                        e Zd Z	 	 d	 d fdZ fdZedddddddddee   d	ee   d
ee   dedede	e
   de	e   de	ee      de	e   fd       Z xZS )ConvolutionBinaryc                     t         |   |||d t        j                  j                  j
                  j                  d       || _        y )N3aoti_torch_cpu_mkldnn__convolution_pointwise_binaryr   )r   r   rT   rU   r   r   binarycpp_constant_args)r   r|   rh   rn   r   r   s        r7   r   zConvolutionBinary.__init__N  sI     			((??FFQ 	 	
 "3r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   zConvolutionBinary.codegen_  r   r9   r   r   r$   r   r   r   r   r   r    binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc           
          t        | |||||||      \  }}}}}| j                  ||      }|j                  d|       ||	|
|t        |      |gz   }t	        |||      }t        |      S )Nr	   r   )ro   rY   r^   r   r   r   )r_   r   r$   r   r   r   r   r   r    r   r   r   r   r   rh   rn   rm   re   ru   r~   s                       r7   r   zConvolutionBinary.createc  s    . /FD(GY
	
 ((0@Aa%#M2)
 
 # '

 #6**r9   )r=   r=   r   )r   r   r   r   r   r   r+   r-   strr   floatr   r   r   r   s   @r7   r   r   M  s    
 3 
3"! '+'+ '+ 	'+
 '+ s)'+ c'+ 9'+ '+ '+ uo'+ SM'+  S	*'+ "#'+ '+r9   r   c                        e Zd Z	 d	 d fdZ fdZdeej                     fdZe	ddddddd	dd
e
e   de
e   de
e   dededee   dee   dee
e      dee   fd       Z xZS )ConvolutionBinaryInplacer   c                 ^   |d   |d   g|dd  z   }t         |   |||d t        j                  j                  j
                  j                  d       t        t        |d   j                               |d   |       t        t        |d   j                               |d   |       g| _
        y )Nr	   r   r'   4aoti_torch_cpu_mkldnn__convolution_pointwise_binary_r   ry   )r   r   rT   rU   r   _convolution_pointwise_r   r   r   rs   mutation_outputs)r   rm   rh   rn   reordered_inputsr   s        r7   r   z!ConvolutionBinaryInplace.__init__  s     #1Ivay1F12J>		((@@GGR 	 	
 :VAY-A-A-CDfQiQUV:VAY-A-A-CDfQiQUV!
r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   z ConvolutionBinaryInplace.codegen  r   r9   c                     t               S r<   r   r   s    r7   get_unbacked_symbol_defsz1ConvolutionBinaryInplace.get_unbacked_symbol_defs  
    |r9   r   r   r$   r   r   r   r   r   r    r   r   r   r   r   c           
         t        | |||||||      \  }}}}}| j                  ||      }|j                  d|       ||	|
|t        |      |gz   }t	        t        |d   j                               ||      }|j                  d   S )Nr	   ry   )rm   rh   rn   r   )ro   rY   r^   r   r   r   rs   rh   )r_   r   r$   r   r   r   r   r   r    r   r   r   r   r   rh   rn   ru   re   r~   s                      r7   r   zConvolutionBinaryInplace.create  s    . /FD(GY
	
 ((0@Aa%#M2)
 
 *$F1I,@,@,BC'
 }}Qr9   r   r   )r   r   r   r   r   r   rP   Symbolr   r   r+   r-   r   r   r   r   r   r   r   s   @r7   r   r     s    
 	

 

.!*U\\*B  * *  *  	* 
 *  s)*  c*  9*  *  *  uo*  SM*   S	**  "#*  * r9   r   c                        e Zd Z	 d	 d fdZ fdZedddddddee   dee   d	ee   d
ee   dedeee	      fd       Z
 xZS )ConvolutionTransposeUnaryc                     t         |   |||d t        j                  j                  j
                  j                  d       y )N6aoti_torch_cpu_mkldnn__convolution_transpose_pointwiser   )r   r   rT   rU   r    _convolution_transpose_pointwiser   r   s       r7   r   z"ConvolutionTransposeUnary.__init__  s?     			((IIQQT 	 	
r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   z!ConvolutionTransposeUnary.codegen  r   r9   r   r   r   r   r   output_padding_r   r   groups_r   c                     d}t        | |||||||||
      \  }}}}}||	t        |
      |gz   }t        |||      }t        |      S )NTr   )ro   r   r   r   )r_   r   r   r   r   r   r   r   r   r   r   r   r!   rh   rn   rm   ru   r~   s                     r7   r   z ConvolutionTransposeUnary.create  s     
 /
	
 &#G,)
 

 + '

 #6**r9   r   r   r   r   s   @r7   r   r     s    
 	

 

! ++++ ++ 	++
 s)++ c++ c++ 9++ ++ $s)$++ ++r9   r   c                        e Zd Z	 d	 d fdZ fdZeddddddddddd	dd
ddee   dee   dee   dededefd       Z	 xZ
S )QConvPointWisePT2Ec                     t        |      dk(  | _        t        |   |||dt        j
                  j                  j                  j                  d       y)a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
           N(aoti_torch_cpu__qconv2d_pointwise_tensorr   )	r(   has_biasr   r   rT   rU   onednnqconv2d_pointwiser   r   s       r7   r   zQConvPointWisePT2E.__init__  sO      Fq(		((::BBF 	 	
r9   c                     |j                  d       t        | 	  |       t        | j                  t
              r| j                  |       y y r   r   r   r   rH   r|   r   codegen_size_assertsr   s     r7   r   zQConvPointWisePT2E.codegen9  ?    $$%UV dkk6*%%g. +r9   qxr   ri   rj   qwrk   rl   r   r   r   r   r    output_scaleoutput_zero_pointc                 0   d}d }t        | ||||	||
|||||||g      \  }}}}}||d   |d   c|d<   |d<   n|d   |d   c|d<   |d<   |||||t        |      |gz   }|J |t        j                  t        j                  fv r||_        t        |||      S )NFr'   r	   r   r   )ro   r   rT   float32bfloat16dtyper   )r_   r   ri   rj   r   rk   rl   r   r   r   r   r    r   r   output_dtyper   r   r   r!   r"   rh   rn   rm   ru   s                           r7   r   zQConvPointWisePT2E.create?  s   * 
 /lG\:
	
 <1>q1A=QRCS.M!mA.1>q1A=QRCS.M!mA.%#G,)
 
 '''EMM5>>:: #/M! '
 	
r9   r   r   )r   r   r   r   r   r   r+   r-   r   r   r   r   s   @r7   r   r     s    
 	

 

4/ B
B
 B
 "	B

 B
 B
 "B
 B
 S	B
 cB
 s)B
 B
 B
 B
 B
r9   r   c                        e Zd Z	 d	 d fdZ fdZd Zdeej                     fdZ	e
ddddd	dd
ddddddee   dee   dee   deddddfd       Z xZS )QConvPointWiseBinaryPT2Er   c                     t        |      dk(  | _        d| _        t        |   |||dt
        j                  j                  j                  j                  d       y)ag  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, x_scale, x_zp, w,  w_scale, w_zp, accum, b]
            - const_args = [stride, padding, dilation, groups, o_scale, o_zp,
            output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, x_scale, x_zp, w,  w_scale, w_zp, accum]
            - const_args [b, stride, padding, dilation, groups, o_scale, o_zp,
             output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
              N/aoti_torch_cpu__qconv2d_pointwise_binary_tensorr   )
r(   r   idx_for_inplace_sumr   r   rT   rU   r   r   r   r   s       r7   r   z!QConvPointWiseBinaryPT2E.__init__  sW    " Fq(#$ 		((::AAN 	 	
r9   c                     |j                  d       t        | 	  |       t        | j                  t
              r| j                  |       y y r   r   r   s     r7   r   z QConvPointWiseBinaryPT2E.codegen  r   r9   c                 R    | j                   | j                     j                         gS r<   )rh   r   get_namer   s    r7   get_mutation_namesz+QConvPointWiseBinaryPT2E.get_mutation_names  s#    D445>>@AAr9   c                     t               S r<   r   r   s    r7   r   z1QConvPointWiseBinaryPT2E.get_unbacked_symbol_defs  r   r9   r   r   ri   rj   r   qaccumr   r   r   r   r    r   r   c                    d}d }t        | ||||
|	||||||||g|      \  }}}}}||d   |d   c|d<   |d<   n|d   |d   c|d<   |d<   |||||||||t        |      |g
z   }|dk(  sJ d       t        j                  j	                  |j                                t        t        |j                               ||      }|j                  |j                     S )	NFr'   r	   r   sumzCFor now, only post op sum is supported in QConvPointWiseBinaryPT2E.ry   r   )ro   r   r   rN   mark_buffer_mutatedr   r   r   rs   rh   r   )r_   r   ri   rj   r   rk   rl   r   r   r   r   r   r    r   r   r   accum_scaleaccum_zero_pointr   alphar   r   r   r!   r"   rh   rn   _kernel_layoutre   r~   s                                 r7   r   zQConvPointWiseBinaryPT2E.create  sA   4 
 /lG\:
	
" <1>q1A=QRCS.M!mA.1>q1A=QRCS.M!mA.%#M2)
 
 e# 	
Q	
# 	
##FOO$56)V%6%6%89'
 }}V7788r9   r   r   )r   r   r   r   r   r   r   rP   r   r   r   r+   r-   r   r   r   s   @r7   r   r     s    
 	

 

8/B*U\\*B  O9O9 O9 "	O9
 O9 O9 O9 S	O9 cO9 s)O9 O9 "O9 'O9 O9r9   r   c                   @     e Zd Z	 d	 d fdZ fdZed        Z xZS )MKLPackedLinearc                     t         |   |||d t        j                  j                  j
                  j                         y N)r   )r   r   rT   rU   mkl_mkl_linearr   r   s       r7   r   zMKLPackedLinear.__init__  s:     			1199 	 	
r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   zMKLPackedLinear.codegen  r   r9   c                    | j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }}t        |      |gz   }	t	        j
                  |	      }
|||g}|g}|||gz  }n|j                  dd        t        t        |j                         |j                         |	|
      ||      S )Nr   r   )require_stride1realize_inputrr   r+   r   r[   r^   r   r   rs   r]   )r_   r   packed_worig_wB
batch_sizert   ru   rv   r.   rg   rh   rn   s                r7   r   zMKLPackedLinear.create  s     1 1! 45$$S%6%6v%>?

A!A1gn&99+FXv&#=qcMF  D){M '
 	
r9   r   r   r   r   r   r   r   r   r   r   r   s   @r7   r   r     s0    
 	

 

! 
 
r9   r   c                   F     e Zd Z	 d	 d fdZ fdZed        Zd Z xZS )LinearUnaryc                     t         |   |||d t        j                  j                  j
                  j                  d       y )N aoti_torch_cpu__linear_pointwiser   )r   r   rT   rU   r   _linear_pointwiser   r   s       r7   r   zLinearUnary.__init__-  s?     			((::BB> 	 	
r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   zLinearUnary.codegen<  r   r9   c                    | j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }	}t        |      |	gz   }
||g}||r|ndg|g}|2| j                  | j                  |            }|j	                  |       n|j                  dd        t        t        |j                         |j                         |
      ||      }t        |      S )Nr   rz   r   rA   r   )require_contiguousr  rr   r+   r)   r^   r
  r   rs   r]   r   )r_   r   wr  r   r   r   rt   _icrv   r.   rh   rn   r~   s                 r7   r   zLinearUnary.create@  s    ""3#4#4Q#78""3#4#4Q#78**,C**,C1gnQ'wtYG=&&s'8'8';<AMM!  D)||~kkm 
 '
 #6**r9   c                      y r<   r=   r   s    r7   apply_constraintzLinearUnary.apply_constraint[      r9   r   r   )	r   r   r   r   r   r   r   r  r   r   s   @r7   r
  r
  ,  s5    
 	

 

! + +4r9   r
  c                   J     e Zd ZdZ	 d	 d fdZ fdZed        Zd Z xZ	S )LinearBinaryz)torch.ops.mkldnn._linear_pointwise.binaryc                     t         |   |||d t        j                  j                  j
                  j                  d       y )N'aoti_torch_cpu__linear_pointwise_binaryr   )r   r   rT   rU   r   r  r   r   s       r7   r   zLinearBinary.__init__b  s?     			((::AAE 	 	
r9   c                 F    |j                  d       t        | 	  |       y r   r   r   s     r7   r   zLinearBinary.codegenq  r   r9   c                 J   | j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }}t        |      |gz   }	|||g}
|g}|2| j                  | j                  |            }|
j	                  |       n|j                  d|       t        t        |j                         |j                         |	      |
|      }t        |      S )Nr   r  r   )r  r  rr   r+   r)   r^   r  r   rs   r]   r   )r_   r   yr  r  r   rt   r  rv   r.   rh   rn   r~   s                r7   r   zLinearBinary.createu  s
   ""3#4#4Q#78""3#4#4Q#78""3#4#4Q#78**,C**,C1gnQ=&&s'8'8';<AMM!  A&||~kkm 
 '
 #6**r9   c                      y r<   r=   r   s    r7   r  zLinearBinary.apply_constraint  r  r9   r   r   )
r   r   r   r5   r   r   r   r   r  r   r   s   @r7   r  r  _  s:    8F 	

 

! + +6r9   r  c                   h     e Zd Z	 	 d	 d fdZ fdZeddddddddddd	dd
ddedefd       Z xZ	S )QLinearPointwisePT2Ec                     || _         t        | 	  |||dt        j                  j
                  j                  j                  d       y)a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        N(aoti_torch_cpu__qlinear_pointwise_tensorr   )r   r   r   rT   rU   r   qlinear_pointwisetensorr   r|   rh   rn   r   r   s        r7   r   zQLinearPointwisePT2E.__init__  sF    " !));;BBG 	 	
r9   c                     |j                  d       t        | 	  |       t        | j                  t
              r| j                  |       y y r   r   r   s     r7   r   zQLinearPointwisePT2E.codegen  ?    $$%UV dkk6*%%g. +r9   r   r   ri   rj   r   rk   rl   r   r   r   c           
          t        | |||||||g      \  }}}}}|||	|
|t        |      |gz   }|
J |
t        j                  t        j                  fv r|
|_        t        ||||d u      S )Nr|   rh   rn   r   )rw   r   rT   r   r   r   r!  )r_   r   ri   rj   r   rk   rl   r   r   r   r   post_op_namepost_op_argspost_op_algorithmrh   rn   rm   ru   s                     r7   r   zQLinearPointwisePT2E.create  s    " 8UlG\:8
4q! &#L1)
 
 '''EMM5>>:: #/M# '$&	
 	
r9   r=   Tr   )
r   r   r   r   r   r   r   r-   r   r   r   s   @r7   r!  r!    s    
 
 

6/ ,
,
 ,
 "	,

 ,
 ,
 ",
 ,
 ,
 ,
 ,
r9   r!  c                   r     e Zd Z	 	 d	 d fdZ fdZd Zeddddddddd	dd
ddddddedefd       Z	 xZ
S )QLinearPointwiseBinaryPT2Ec                     || _         d| _        t        |   |||dt        j
                  j                  j                  j                  d       y)a  
        if bias is not None
            - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2, bias]
            - const_args is: [o_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2]
            - const_args is: [bias, o_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        r   N/aoti_torch_cpu__qlinear_pointwise_binary_tensorr   )	r   r   r   r   rT   rU   r   r$  binary_tensorr&  s        r7   r   z#QLinearPointwiseBinaryPT2E.__init__  sN    " !#$ ));;IIM 	 	
r9   c                     |j                  d       t        | 	  |       t        | j                  t
              r| j                  |       y y r   r   r   s     r7   r   z"QLinearPointwiseBinaryPT2E.codegen  r   r9   c                 ~    | j                   d   }|dk(  r(| j                  | j                     j                         gS g S )Nr   )rn   rh   r   r   )r   binary_post_ops     r7   r   z-QLinearPointwiseBinaryPT2E.get_mutation_names  s@    ++B/U"KK 8 89BBDEEIr9   r   r   ri   rj   r   rk   rl   r$   r   r   r   c                    t        | |||||||g||dk(        \  }}}}}||	|
||||||t        |      |g
z   }|dk(  rot        j                  j	                  |j                                t        t        |j                               |||d u      }|j                  |j                     S |J |t        j                  t        j                  fv r||_        t        ||||d u      S )Nr   ry   r*  )rw   r   r   rN   r   r   r0  r   rs   rh   r   rT   r   r   r   )r_   r   ri   rj   r   rk   rl   r$   r   r   r   r   other_scaleother_zpr7  r   unary_post_opunary_post_op_argsunary_post_op_algorithmrh   rn   rm   re   r~   s                           r7   r   z!QLinearPointwiseBinaryPT2E.create  s$   8 *lG\:e#
	
 &#$67#)
 
 U"GG''(89/!)9)9);<+d*	F ==!;!;<<'''EMM5>>:: #/M) '$&	
 	
r9   r.  r   )r   r   r   r   r   r   r   r   r-   r   r   r   s   @r7   r0  r0    s    
 
 

8/ H
H
 H
 "	H

 H
 H
 "H
 H
 H
 H
 H
 H
r9   r0  c            !            e Zd Z	 d	 d fdZeddddddddddddd	dd
edee   dededededededef d       Z fdZ	 xZ
S )MkldnnRnnLayerc                     t         |   |||d t        j                  j                  j
                  j                         y r   )r   r   rT   rU   rV   mkldnn_rnn_layerr   r   s       r7   r   zMkldnnRnnLayer.__init___  s:     			77?? 	 	
r9   r   r   w0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                    | j                  | j                  |            }|j                          | j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }|j                          | j                  | j                  |            }|j                          |j                         }t	        |      dk(  sJ d       |\  }}}|||g}|j                         }|j                         }|||||||g}||	|
||||||g	}t        t        |j                               ||      }d }|||dgg} |||      t        j                  |      t        j                  |      dgg}t        t        ||            D cg c]D  \  }\  }}t        t        |j                         |j                         ||      |t        |fg      F } }}}| |_        | S c c}}}w )N   zExpect lstm input to be 3Dry   )rh   rn   c                 V    t        |       dk(  sJ d       t        j                  |       S )NrR  zExpect output_shape to be 3D)r(   r   r[   )output_shaperO  s     r7   get_strides_of_lstm_outputz9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_output  s,    |$)I+II)!44\BBr9   r	   )r  r  freeze_layoutrr   r(   r?  r   rs   r   r[   	enumeratezipr   r   r]   tupler}   )!r_   r   rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rN  rO  rP  r3   
seq_length
mini_batchrT  hy_shapecy_shaperh   rn   r~   rU  output_sizesoutput_stridesrI   r.   rg   r   s!                                    r7   r   zMkldnnRnnLayer.createm  s]   (  1 1! 45 	
  !2!22!67  !2!22!67  !2!22!67  !2!22!67  !2!22!67
  !2!22!67
ZZ\
:!#A%AA# .8*
J
"J<;;=;;=RRR,

  Q\\^4'
	C %h1#>&|[A--h7--h7C	
" 4=L.14
 
 0/K LLNKKM!	 	
	 
 ##
s   2A	Ic                 D    |j                  d       t        | 	  |      S r   r   r   s     r7   r   zMkldnnRnnLayer.codegen  s!    $$%UVww''r9   r   r   )r   r   r   r   r   boolr+   r-   r   r   r   r   s   @r7   r?  r?  ^  s    
 	

 

 [[ [ 	[
 [ [ [ [ [ #Y[ [ [ [ [ [  ![" #[ [z( (r9   r?  c                   R     e Zd Z	 d	 d fdZ fdZe	 	 	 	 	 	 	 	 dd       Z xZS )WeightInt4PackMatmulc                     t        |      dk(  sJ t        |      dk(  sJ t        | 	  |||dt        j                  j
                  j                  j                  d       y)zY
        inputs = [x, w, qGroupSize, qScalesAndZeros]
        constant_args = ()
           r   N-aoti_torch_cpu__weight_int4pack_mm_cpu_tensorr   )r(   r   r   rT   rU   	quantizedint4mm_packed_weight_cpur   r   s       r7   r   zWeightInt4PackMatmul.__init__  sd     6{a=!Q&&&,,EEMML 	 	
r9   c                     |j                  d       t        | 	  |       t        | j                  t
              r| j                  |       y y r   r   r   s     r7   r   zWeightInt4PackMatmul.codegen  r(  r9   c                    ||||g}|j                         ^ }}|j                         \  }}t        |      |gz   }	t        j                  |	      }
t	        |j                         |j                         |	|
      }t        ||      S )N)r|   rh   )rr   r+   r   r[   r   rs   r]   rc  )r_   r   r  
qGroupSizeqScalesAndZerosrh   rt   ru   nr.   rg   rm   s               r7   r   zWeightInt4PackMatmul.create  s     Q
O4

Azz|11gm&99+F#LLNKKM	
 $ 
 	
r9   r   r   )r   r   r  r   rk  r   rl  r   r  r   s   @r7   rc  rc    sV    
 	

 

*/ 

 
  	

 %
 
r9   rc  )FNNN)NNF)1collections.abcr   typingr   r   rP   rT   torch._prims_commonr   torch.utils._ordered_setr   irr
   r   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   virtualizedr   r-   ra  r+   ro   rw   r   r   r   r   r   r   r   r   r
  r  r!  r0  r?  rc  r=   r9   r7   <module>ru     s   $     > /    ;  .215#'jIjI jI 	jI
 c]jI SMjI smjI jI jI Xc]+jI D-.jI K jId 26#'<I<I <I 	<I
 D-.<I K <I <I~5+( 5+p>+) >+BJ 0 J Z@+ 1 @+Fd
* d
Ny90 y9x(
' (
V0# 0f3$ 3lP
, P
fs
!2 s
lo(& o(f3
, 3
r9   