
    Vhgp                        U d dl Z d dlmZmZmZ ddlmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlZddgZ ed      Z ed      Ze j:                  j<                  Zd Zi Z e!eef   e"d<   d Z#d:deeeef   geeef   f   fdZ$ e$ejJ                        ddde&fd       Z' e$ejP                        d;de&fd       Z) e$ejT                        d;de&fd       Z+ e$ejX                        d;de&fd       Z- e$ej\                        	 	 	 	 	 d<de&fd       Z/	 d:de0e&   de0e&   de0e&   de1de&f
dZ2 e$ejf                  ejh                  g      ddde&fd        Z5 e$ejl                        de&fd!       Z7d" Z8 e$ejr                  ejt                  ejv                  g      ddde&fd#       Z<d$ Z=dd%dee>e>e&d&f   e>e&d&f   e>e&d&f   e	e>e&d&f      f      fd'Z?dd%dee>e>e&d&f   e>e&d&f   e>e&d&f   e	e>e&d&f      f      fd(Z@ e$ej                  d)*      ddde&fd+       ZB e$ej                  d)*      de&fd,       ZDd- ZE e$ej                  ej                  ej                  g      ddde&fd.       ZI e$ej                  d)*      de&fd/       ZK e$ej                  d)*      de&fd0       ZMi ejJ                  e'ejP                  e)ejT                  e+ejX                  e-ej\                  e/ejf                  e5ejh                  e5ejl                  e7ejr                  e<ejt                  e<ejv                  e<ej                  eIej                  eIej                  eIej                  eBej                  eDej                  eKej                  eMiZ d1 ZNg d2ZOd3 ZPd4 ZQd5 ZRd6 ZS G d7 d      ZT G d8 d9e      ZUy)=    N)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyOptionalUnionTypeVarCallable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_Pc                 R    t        | t        j                        r| j                  S | S N)
isinstancetorchTensorshape)is    H/home/dcms/DCMS/lib/python3.12/site-packages/torch/utils/flop_counter.py	get_shaper!      s    !U\\"wwH    flop_registryc                 4     t               d d fd
       }|S )N)out_valc                 F    t        t        ||| f      \  }}} |d|i|S )N	out_shape)r   r!   )r%   argskwargsr'   fs       r    nfzshape_wrapper.<locals>.nf   s2    "*9tVW6M"Nfi$6)6v66r"   r   r*   r+   s   ` r    shape_wrapperr-      s#    
1X 7 7 Ir"   returnc                 d     dt         t        t        f   dt         t        t        f   f fd}|S )Nflop_formular.   c                      st                 fd}t        j                  j                  j	                  |        S )Nc                     t        | t        j                  j                        st	        d|  dt        |              | t        v rt        d|        t        | <   y )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
ValueErrortyper#   RuntimeError)targetr0   s    r    registerz=register_flop_formula.<locals>.register_fun.<locals>.register(   si    fejj&A&AB Hh0f@A A &"%A&#JKK$0M&!r"   )r-   r   utils_pytree	tree_map_)r0   r9   get_rawtargetss   ` r    register_funz+register_flop_formula.<locals>.register_fun$   s7    (6L	1 	%%h8r"   )r   r   r   )r>   r=   r?   s   `` r    r   r   #   s0    8BF#3 R8H & r"   )r'   c                :    | \  }}|\  }}||k(  sJ ||z  dz  |z  S )zCount flops for matmul.    )	a_shapeb_shaper'   r(   r)   mkk2ns	            r    mm_floprI   9   s3    
 DAqEB7N7q519q=r"   c                     t        ||      S )zCount flops for addmm.rI   
self_shaperC   rD   r'   r)   s        r    
addmm_floprN   D   s     7G$$r"   c                 V    | \  }}}|\  }}}	||k(  sJ ||k(  sJ ||z  |	z  dz  |z  }
|
S )z"Count flops for the bmm operation.rA   rB   )rC   rD   r'   r)   brE   rF   b2rG   rH   flops              r    bmm_floprS   I   sK    
 GAq!IBA7N77N7q519q=1DKr"   c                     t        ||      S )z&Count flops for the baddbmm operation.rS   rL   s        r    baddbmm_floprV   V   s    
 GW%%r"   c	                     t        | |      S )zCount flops for _scaled_mm.rK   )
rC   rD   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr'   r)   s
             r    _scaled_mm_flopr^   ]   s     7G$$r"   x_shapew_shaper'   
transposedc                 t    | d   }|r| n|dd }|^}}}	 t        |      t        |      z  |z  |z  |z  dz  }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   rA   Nr   )
r_   r`   r'   ra   
batch_size
conv_shapec_outc_infilter_sizerR   s
             r    conv_flop_countrh   n   s]    * J''Y;J 'E4+ 
d;//*<uDtKaODKr"   c                     t        | |||      S )zCount flops for convolution.ra   )rh   )
r_   r`   _bias_stride_padding	_dilationra   r'   r(   r)   s
             r    	conv_flopro      s     7GY:NNr"   c                    d }d}	 |
d   r t        |d         }|t        | |||       z  }|
d   rZt        |d         }|r&|t         ||        ||       ||      d      z  }|S |t         ||       ||        ||      d      z  }|S )Nc                 4    | d   | d   gt        | dd        z   S )Nr   r   rA   )list)r   s    r    tzconv_backward_flop.<locals>.t   s$    a%(#d59o55r"   r   r   Frj   )r!   rh   )grad_out_shaper_   r`   rk   rl   rm   rn   ra   _output_padding_groupsoutput_maskr'   rs   
flop_countgrad_input_shapegrad_weight_shapes                   r    conv_backward_flopr{      s    6JDL 1~$Yq\2ong?OU_Q_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr"   c                     | \  }}}}|\  }}}	}
|\  }}}}||cxk(  r|k(  r"n J ||cxk(  r|k(  rn J ||
k(  r
|	|k(  r||
k(  sJ d}|t        ||z  ||f||z  ||	f      z  }|t        ||z  ||	f||z  |	|f      z  }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   rU   )query_shape	key_shapevalue_shaperP   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopss                   r    sdpa_flop_countr     s     !NAq#s"Cc3$Cc3?s?[[qC3[[3#:#*QTX[Q[[[K8QUC-AsC/@AAK8QUC-AsC/@AAKr"   c                    t        | ||      S )Count flops for self-attention.r   )r}   r~   r   r'   r(   r)   s         r    	sdpa_flopr     s     ;	;??r"   c                     ddl m} ddlm} t	        | ||f      s7| j
                  j                  dk7  r| j                         j                         S |g| j                  d      dz
  z  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r   devicer6   difftolistsize)offsetsmax_lenr   r   s       r    _offsets_to_lengthsr     s[    
 9Dg
,<=>7>>CVCVZ`C`||~$$&&9Q!+,,r"   )grad_out.c              #   Z  K   |t        |j                        dk(  sJ t        |j                        dk(  sJ ||j                  | j                  k(  sJ | j                  \  }}	}
|j                  \  }}}|j                  \  }}}|J |J |j                  |j                  k(  sJ t        ||      }t        ||      }t        ||      D ]%  \  }}d|	||
f}d|||f}d|||f}||nd}||||f ' y| j                  |j                  |j                  ||j                  ndf yw)a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   lenr   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r    %_unpack_flash_attention_nested_shapesr   *  s[    $  399~"""5;;1$$$8>>U[[#@@@kk3ii3kk3$$$$$$)//111+Iu=+Iu=&)-&G 	V"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU	V 	
++syy%++AUx~~[_
__s   D)D+c              #   `  K   |t        |j                        dk(  sJ t        |j                        dk(  sJ ||j                  | j                  k(  sJ | j                  \  }}}	}
|j                  \  }}}}|j                  \  }}}}|J |J |j                  |j                  k(  sJ t        ||      }t        ||      }t        ||      D ]%  \  }}d|	||
f}d|||f}d|||f}||nd}||||f ' y| j                  |j                  |j                  ||j                  ndf yw)a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r    )_unpack_efficient_attention_nested_shapesr   X  sd    $  399~"""5;;1$$$8>>U[[#@@@1c31c31c3''''''!!\%7%7777'lC	'lC		95 	VLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU	V 	
++syy%++AUx~~[_
__s   D,D.T)r=   c          	      J    t        | ||||||      }
t        d |
D              S )r   )r   r   r   r   r   r   r   c              3   @   K   | ]  \  }}}}t        |||        y wr   r   .0r}   r~   r   r   s        r    	<genexpr>z0_flash_attention_forward_flop.<locals>.<genexpr>  )      2KK 	Y<   r   sum)r   r   r   r   r   r   r   r'   r(   r)   sizess              r    _flash_attention_forward_flopr     s?    " 2E  6;  r"   c           	      J    t        | ||||||      }
t        d |
D              S )r   )r   r   r   r   r   r   r   c              3   @   K   | ]  \  }}}}t        |||        y wr   r   r   s        r    r   z4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r(   r)   r   s              r    !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r"   c                    d}|\  }}}}|\  }	}
}}|\  }}}}| \  }}}}||	cxk(  r|cxk(  r|k(  rn J ||
cxk(  r|cxk(  r|k(  r	n J ||k(  sJ ||k(  r
||k(  r||k(  sJ d}|t        ||z  ||f||z  ||f      z  }|t        ||z  ||f||z  ||f      z  }|t        ||z  ||f||z  ||f      z  }|t        ||z  ||f||z  ||f      z  }|t        ||z  ||f||z  ||f      z  }|S )Nr   rU   )rt   r}   r~   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4s                        r    sdpa_backward_flop_countr     sf   K NAq#s"Cc3$Cc3'Cc3!s!c!KKa3&<#&<&<KKKK#:#*33K 8QUC-AsC/@AAK 8QUC-AsC/@AAK8QUC-AsC/@AAK 8QUC-AsC/@AAK8QUC-AsC/@AAKr"   c                    t        | |||      S )z(Count flops for self-attention backward.r   )rt   r}   r~   r   r'   r(   r)   s          r    sdpa_backward_flopr     s    
 $NKKXXr"   c
           
      L    t        |||| ||||	      }t        d |D              S )N)r   r   r   r   r   r   r   r   c              3   B   K   | ]  \  }}}}t        ||||        y wr   r   r   r}   r~   r   rt   s        r    r   z1_flash_attention_backward_flop.<locals>.<genexpr>  +      ?KK 	!iU   r   )r   r   r   r   out	logsumexpr   r   r   r   r(   r)   shapess                r    _flash_attention_backward_flopr     sB    " 3	F  CI  r"   c
           
      L    t        |||| ||||	      }t        d |D              S )N)r   r   r   r   r   r   r   r   c              3   B   K   | ]  \  }}}}t        ||||        y wr   r   r   s        r    r   z5_efficient_attention_backward_flop.<locals>.<genexpr>&  r   r   r   )r   r   r   r   r   r   r   r   r   r   r(   r)   r   s                r    "_efficient_attention_backward_flopr     sB    " 7!!!!	F  CI  r"   c                 ,    t        | t              s| fS | S r   )r   tuple)xs    r    normalize_tupler   A  s    atHr"   ) KMBTc                     t        dt        t        t              dz
  t        t	        |             dz
  dz              }t        |   S )Nr   r   rA   r   )maxminr   suffixesstr)numberindexs     r    get_suffix_strr   J  s=     3s8}q(3s6{+;a+?A*EFGEE?r"   c                 X    t         j                  |      }| d|z  z  d}|t         |   z   S )Ni  z.3f)r   r   )r   suffixr   r   s       r    convert_num_with_suffixr   Q  s2    NN6"E%c*E8E?""r"   c                     |dk(  ry| |z  dS )Nr   0%z.2%rB   )numdenoms     r    convert_to_percent_strr  X  s    zEk#r"   c                 .     t                fd       }|S )Nc                 B    t        |       \  }} | }t        ||      S r   )r   r   )r(   	flat_argsspecr   r*   s       r    r+   z)_pytreeify_preserve_structure.<locals>.nf^  s'    &t,	4mc4((r"   r   r,   s   ` r    _pytreeify_preserve_structurer  ]  s     
1X) )
 Ir"   c                        e Zd ZdZ	 	 	 	 ddeeej                  j                  e	ej                  j                     f      de
dedeeeef      f fdZde
fdZdeeeee
f   f   fd	Zdd
Zd Zd Zd Z xZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    modsdepthdisplaycustom_mappingc                 d   t         |           t        d       | _        || _        || _        d | _        |i }|t        j                  dd       i t        |j                         D ci c]   \  }}|t        |dd      r|n
t        |      " c}}| _	        t               | _        y c c}}w )Nc                       t        t              S r   )r   intrB   r"   r    <lambda>z*FlopCounterMode.__init__.<locals>.<lambda>  s    +VYJZ r"   z<mods argument is not needed anymore, you can stop passing itrA   )
stacklevel_get_rawF)super__init__r   flop_countsr	  r
  modewarningswarnr#   itemsgetattrr-   r   mod_tracker)selfr  r	  r
  r  rF   v	__class__s          r    r  zFlopCounterMode.__init__{  s     	6ABZ6[
04	!NMMXefg

WeWkWkWmntqRSqwq*e4!-:JJn
 )? os   -%B,r.   c                 N    t        | j                  d   j                               S )NGlobal)r   r  valuesr  s    r    get_total_flopszFlopCounterMode.get_total_flops  s!    4##H-44677r"   c                 |    | j                   j                         D ci c]  \  }}|t        |       c}}S c c}}w )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        )r  r  dict)r  rF   r  s      r    get_flop_countszFlopCounterMode.get_flop_counts  s3     (,'7'7'='='?@tq!47
@@@s   8c                    
 | j                   }|d}dd l}d|_        g d}g } j                         
t	        
      d
 fd}t         j                  j                               D ]?  }|dk(  r	|j                  d      d	z   }||kD  r# |||d	z
        }|j                  |       A d j                  v r s|D ]  }	d
|	d   z   |	d<     |dd      |z   }t        |      dk(  rg dg}|j                  ||d      S )Ni?B r   T)ModuleFLOPz% TotalFc           	         t        
j                  |    j                               }	|k\  z  	d|z  }g }|j                  || z   t	        |      t        |      g       
j                  |    j                         D ]<  \  }}|j                  |dz   t        |      z   t	        |      t        |      g       > |S )N z - )r   r  r   appendr   r  r  r   )mod_namer	  r   paddingr   rF   r  global_flopsglobal_suffixis_global_subsumedr  s          r    process_modz.FlopCounterMode.get_table.<locals>.process_mod  s     d..x8??ABK+"==EkGFMM("']C&{LA 
 ((288: 1eOc!f,+A}=*1l;  Mr"   r  .r   r*  )r  0r   )leftrightr5  )headerscolalign)r	  tabulatePRESERVE_WHITESPACEr"  r   sortedr  keyscountextendr   )r  r	  r8  headerr   r1  mod	mod_depth
cur_valuesr   r.  r/  r0  s   `         @@@r    	get_tablezFlopCounterMode.get_table  s.   =JJE=E'+$.++-&|4"	, $**//12 	&Ch		#*I5 $S)a-8JMM*%	& t'''0B *q>a* !1-6Fv;!+,F  B\ ]]r"   c                     | j                   j                          | j                  j                          t	        |       | _        | j
                  j                          | S r   )r  clearr  	__enter___FlopCounterModer  r!  s    r    rE  zFlopCounterMode.__enter__  sG     ""$$T*			r"   c                     | j                   J  | j                   j                  | }d | _         | j                  j                          | j                  r$t	        | j                  | j                               |S r   )r  __exit__r  r
  printrB  r	  )r  r(   rP   s      r    rH  zFlopCounterMode.__exit__  sb    yy$$$DII%	!!#<<$..,-r"   c                     || j                   v rY| j                   |   } ||i |d|i}t        | j                  j                        D ]  }| j                  |   |xx   |z  cc<    |S )Nr%   )r#   setr  parentsr  )r  func_packetr   r(   r)   flop_count_funcrx   pars           r    _count_flopszFlopCounterMode._count_flops  sx    $,,,"00=O($F&F#FJ4++334 A  %k2j@2A 
r"   )NrA   TNr   )__name__
__module____qualname____doc__r	   r
   r   nnr'  rr   r  boolr$  r   r  r"  r   r%  rB  rE  rH  rP  __classcell__)r  s   @r    r   r   g  s    * MQ 7;+5$uxx2G!GHI+ + 	+
 %T#s(^4+*8 8
Ac4S>&9!: 
A:^zr"   c                        e Zd ZdefdZddZy)rF  counterc                     || _         y r   )rY  )r  rY  s     r    r  z_FlopCounterMode.__init__  s	    r"   Nc                 x   |r|ni }|t         j                  j                  j                  j                  t         j                  j                  j                  j
                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                   j                  t         j                  j"                  j$                  j                  hv rt&        S || j(                  j*                  vra|t         j                  j"                  j,                  j                  ur1| 5   |j.                  |i |}|t&        ur|cd d d        S 	 d d d         ||i |}| j(                  j1                  |j2                  |||      S # 1 sw Y   9xY wr   )r   opsatenis_contiguousdefaultmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutNotImplementedrY  r#   r   	decomposerP  _overloadpacket)r  functypesr(   r)   rr   s          r    __torch_dispatch__z#_FlopCounterMode.__torch_dispatch__  s	   !r EIINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3 "! t||111d%))..BWBWB_B_6_ "DNND3F3N* * D#F#||(()=)=sD&QQ s   L00L9)rB   N)rQ  rR  rS  r   r  rs  rB   r"   r    rF  rF    s     Rr"   rF  )Fr   )NNNFN)Vr   torch.utils._pytreer   r   r   module_trackerr   typingr   r	   r
   r   r   collections.abcr   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r  __all__r   r   r\  r]  r!   r#   r$  __annotations__r-   r   mmr  rI   addmmrN   bmmrS   baddbmmrV   
_scaled_mmr^   rr   rV  rh   convolution_convolutionro   convolution_backwardr{   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r   r   r  r  r   rF  rB   r"   r    <module>r     s    F F ) : : $ ' # :   5
6T]t_yy~~
 !#tCH~ "XxB?O>PRZ[]_a[aRb>b5c , tww/3 #    tzz"%# % #% txx 
C 
 !
 t||$&C & %& t' % 	% (%( 	%#Y%#Y% Cy% 	%
 	%N (($*;*;<=bf Oux O >O
 t001e e 2eN$ DD@@@@B C EI @WZ @C@	-" +` eE#s(OU38_eCHoxPUVY[^V^P_G``ab+`f -` eE#s(OU38_eCHoxPUVY[^V^P_G``ab-`` t44dC  	 D> t88$G 	 H>6 MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 I@GGWJJ
 	HHh 	LL,	
 	OO_ 	i 	y 	1 	00) 	,,i 	,,i 	99;M 	557I 	557I 	!!#@  	%%'H!" 	""$B#$ 	&&(J%* $# 
L L^"R( "Rr"   