
import math
import os
from collections import defaultdict
from typing import Any, Callable
from typing_extensions import Self

import torch
import torch.utils._pytree as pytree
from torch._guards import active_fake_mode
from torch._inductor.utils import get_device_tflops, get_gpu_dram_gbps
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed._tools.mod_tracker import ModTracker
from torch.utils._mode_utils import no_dispatch
from torch.utils._python_dispatch import TorchDispatchMode
from torch.utils.flop_counter import flop_registry


aten = torch.ops.aten

# Minimum CUDA caching-allocator block size (512 B); 1 when caching is disabled.
_PYTORCH_MIN_ALLOCATE = (
    2**9 if int(os.environ.get("PYTORCH_NO_CUDA_MEMORY_CACHING", 0)) == 0 else 1
)

# No fallback kernel is needed for view ops
_VIEW_OPS = {
    aten.lift_fresh,
    aten.t,
    aten.transpose,
    aten.view,
    aten.detach,
    aten._unsafe_view,
    aten.split,
    aten.adjoint,
    aten.as_strided,
    aten.diagonal,
    aten.expand,
    aten.expand_as,
    aten.movedim,
    aten.permute,
    aten.select,
    aten.squeeze,
    aten.mT,
    aten.mH,
    aten.real,
    aten.imag,
    aten.view_as,
    aten.unflatten,
    aten.unfold,
    aten.unbind,
    aten.unsqueeze,
    aten.vsplit,
    aten.hsplit,
    aten.split_with_sizes,
    aten.swapaxes,
    aten.swapdims,
    aten.chunk,
}
# Tensor-creation ops need no benchmarking either
_CREATE_OPS = {
    aten.randint,
    aten.randn,
    aten.rand,
    aten.randn_like,
    aten.rand_like,
    aten.randint_like,
    aten.arange,
    aten.ones_like,
    aten.zeros_like,
}

_IGNORE_OPS = _VIEW_OPS | _CREATE_OPS

__all__ = ["RuntimeEstimator"]


class RuntimeEstimator(TorchDispatchMode):
    """
    Estimates the GPU runtime in milliseconds using various estimation methods under the ``FakeTensorMode``.

    This class provides a ``TorchDispatchMode`` based context manager that can be used to estimate the eager
    runtime of PyTorch functions. It supports two estimation modes, benchmarking (`operator-level-benchmark`) and
    roofline cost modeling (`operator-level-cost-model`).
    For modules executed under this context manager, it aggregates the forward and backward operation runtimes
    and also records their execution orders.

    Attributes:
        mod_runtimes (Dict[str, Dict[str, float]]): A dictionary of module runtimes. The key to the outer dictionary
            is the fully qualified name (FQN) of the module. For each module the forward and backward runtimes of the
            operations are aggregated in the inner dictionary keyed by 'fw' and 'bw'.
        mod_fw_pre_order (List[str]): List of module FQNs in pre-forward execution order.
        mod_bw_pre_order (List[str]): List of module FQNs in pre-backward execution order.
        mod_fw_post_order (List[str]): List of module FQNs in post-forward execution order.
        mod_bw_post_order (List[str]): List of module FQNs in post-backward execution order.
        total_runtime (float): The total estimated runtime in milliseconds.

    Note:
        1) The benchmarking estimate mode will execute kernels on GPU and assumes that every operation can run in
            isolation without causing an OOM error. It is also designed to be used only under ``FakeTensorMode``.
        2) Currently wrapper tensor sub-classes such as ``DTensor`` won't produce correct estimates. We plan to support
            them in future PRs.
        3) We only estimate the compute time; if your code has communication, it will not be considered. Again, we will
            support this in future PRs.

    Example usage:

        .. code-block:: python

            runtime_estimator = RuntimeEstimator()
            with FakeTensorMode():
                module = ...
                optimizer = ...
                inp = ...
                with runtime_estimator(estimate_mode_type="operator-level-cost-model"):
                    loss = module(inp)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                runtime_estimator.display_modulewise_stats()
    """

    _float_types: set[torch.dtype] = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }
    _no_fallback_kernel: set[torch._ops._OpNamespace] = set()
    fake_mode: FakeTensorMode

    def __init__(self) -> None:
        super().__init__()
        self._estimate: Callable
        self._estimate_mode_type: str
        self._mod_tracker = ModTracker()
        self.mod_runtimes: dict[str, dict[str, float]] = defaultdict(
            lambda: defaultdict(lambda: 0.0)
        )
        self.mod_fw_pre_order: list[str] = []
        self.mod_bw_pre_order: list[str] = []
        self.mod_fw_post_order: list[str] = []
        self.mod_bw_post_order: list[str] = []
        self.total_runtime: float = 0.0

    # NB: runs the op on real tensors to time it, but returns fake tensors.
    @classmethod
    def _maybe_run_and_benchmark_fallback_kernel(
        cls, func, args, kwargs, orig_not_implemented_exception
    ):
        """
        Runs and benchmarks a fallback kernel for a given function.

        Args:
            func (Callable): The function to benchmark.
            args (Tuple): The arguments to pass to the function.
            kwargs (Dict[str, Any]): The keyword arguments to pass to the function.
            orig_not_implemented_exception (Exception): The original exception to raise if the fallback kernel
                is not implemented.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        # Avoid the fallback for operators that in-place modify metadata,
        # because the input fake tensors would not be updated accordingly.
        if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
            raise orig_not_implemented_exception

        inp_impls = {}
        flat_args, args_spec = pytree.tree_flatten((args, kwargs))
        # Do REAL compute here (not on the meta device), outside of the fake
        # mode's kernel-invocation manager.
        with no_dispatch():

            def to_real_tensor(e):
                if cls.fake_mode.is_our_fake(e):
                    if e.dtype in cls._float_types:
                        out = torch.rand_like(e, device=e.fake_device)
                    else:
                        out = torch.ones_like(e, device=e.fake_device)
                    if e.is_sparse:
                        out._coalesced_(e.is_coalesced())
                    inp_impls[id(out)] = e
                    return out
                return e

            flat_args = [to_real_tensor(a) for a in flat_args]
            args, kwargs = pytree.tree_unflatten(flat_args, args_spec)
            r = func(*args, **kwargs)
            warmup_iters, actual_iters = 2, 3
            for _ in range(warmup_iters):
                func(*args, **kwargs)
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record(torch.cuda.current_stream())
            for _ in range(actual_iters):
                func(*args, **kwargs)
            end_event.record(torch.cuda.current_stream())
            torch.cuda.synchronize()
            cuda_time = start_event.elapsed_time(end_event)
            mean_op_time = cuda_time / actual_iters

        storages = set()
        for e in flat_args:
            if isinstance(e, torch.Tensor):
                if not e.is_sparse:
                    storages.add(e._typed_storage()._cdata)

        # Proper aliasing/metadata relationships between outputs and inputs
        # cannot be set up because of the conversion to a real device, unless
        # an input impl can be reused.
        def map_out(e):
            if id(e) not in inp_impls and (
                isinstance(e, torch.Tensor)
                and not e.is_sparse
                and e._typed_storage()._cdata in storages
            ):
                raise orig_not_implemented_exception

            if isinstance(e, torch.Tensor):
                if id(e) in inp_impls:
                    return inp_impls[id(e)]
                return cls.fake_mode.fake_tensor_converter.from_real_tensor(
                    cls.fake_mode, e
                )
            return e

        return (pytree.tree_map(map_out, r), mean_op_time)

    @classmethod
    def _benchmark_estimate(cls, func, args, kwargs) -> tuple[Any, float]:
        """
        Estimates the runtime of a function using benchmarking.

        Args:
            func: The function to estimate.
            args: The arguments to pass to the function.
            kwargs: The keyword arguments to pass to the function.
            res: The result of the function.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        assert isinstance(
            cls.fake_mode, FakeTensorMode
        ), "Initialize/Assign FakeTensorMode before using this function"
        mean_op_time = 0.0
        if func._overloadpacket not in _VIEW_OPS:
            try:
                res, mean_op_time = cls._maybe_run_and_benchmark_fallback_kernel(
                    func,
                    args,
                    kwargs,
                    NotImplementedError,
                )
                return (res, mean_op_time)
            except NotImplementedError:
                cls._no_fallback_kernel.add(func._overloadpacket)
        res = func(*args, **kwargs or {})
        return (res, mean_op_time)

    @classmethod
    def _roofline_estimate(cls, func, args, kwargs) -> tuple[Any, float]:
        """
        Estimates the runtime of a function using a roofline cost model.

        Args:
            func: The function to estimate.
            args: The arguments to pass to the function.
            kwargs: The keyword arguments to pass to the function.
            out: The output of the function.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        assert (
            torch.cuda.is_available()
        ), "Roofline estimation needs to access CUDA capabilities to make estimations"

        def get_num_bytes(t: torch.Tensor) -> int:
            """
            Calculates the memory consumption of a tensor.

            Args:
                t (torch.Tensor): The input tensor.

            Returns:
                int: The memory consumption of the tensor in bytes.
            """
            num_bytes = t.untyped_storage().nbytes()
            # Round up to the caching allocator's minimum block size.
            mem_consumed = (
                math.ceil(num_bytes / _PYTORCH_MIN_ALLOCATE) * _PYTORCH_MIN_ALLOCATE
            )
            return mem_consumed

        def get_compute_time(func_packet, args, kwargs, out, out_dtypes) -> float:
            """
            Estimates the compute time of an aten operator.

            Args:
                func_packet: The operator overload packet.
                args: The arguments to the operator.
                kwargs: The keyword arguments to the operator.
                out: The output of the operator.
                out_dtypes: The output data types.

            Returns:
                float: The estimated compute time in nanoseconds.
            """
            if func_packet in flop_registry:
                assert (
                    len(out_dtypes) == 1
                ), f"Only support single out dtype got {out_dtypes} for {func_packet}"
                dtype = out_dtypes.pop()
                # get_device_tflops reports peta-FLOPS/s, so multiply by 1e15
                # to obtain FLOPS/s.
                peak_gpu_flops = get_device_tflops(dtype) * 1e15
                # Assume roughly 75% of the theoretical peak is achievable.
                factor = 0.75
                peak_empirical_flops = factor * peak_gpu_flops
                flop_count_func = flop_registry[func_packet]
                # Divide by 2 to convert FLOPs to MACs (multiply-accumulate).
                flop_count = flop_count_func(*args, **kwargs, out_val=out) / 2
                # Multiply by 1e9 to express the time in nanoseconds.
                compute_time = (flop_count / peak_empirical_flops) * 1e9
                return compute_time
            return 0.0

        def get_transfer_time(flat_args_kwargs, flat_outs) -> float:
            """
            Estimates the memory transfer time of input and output tensors.

            Args:
                flat_args_kwargs (List[torch.Tensor]): The flat list of arguments and keyword arguments.
                flat_outs (List[torch.Tensor]): The flat list of outputs.

            Returns:
                float: The estimated memory transfer time in nanoseconds.
            """
            gpu_memory_bandwidth = get_gpu_dram_gbps()
            read_bytes = sum(
                get_num_bytes(t)
                for t in flat_args_kwargs
                if isinstance(t, torch.Tensor)
            )
            write_bytes = sum(
                get_num_bytes(t) for t in flat_outs if isinstance(t, torch.Tensor)
            )
            counted_bytes = read_bytes + write_bytes
            # Bandwidth is in GB/s, so bytes / (GB/s) yields nanoseconds.
            transfer_time = counted_bytes / gpu_memory_bandwidth
            return transfer_time

        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        op_time = 0.0
        func_packet = func._overloadpacket
        if func_packet not in _IGNORE_OPS:
            flat_args_kwargs, args_spec = pytree.tree_flatten((args, kwargs))
            flat_outs, out_spec = pytree.tree_flatten(out)
            transfer_time = get_transfer_time(flat_args_kwargs, flat_outs)

            out_dtypes = {
                t.dtype
                for t in flat_outs
                if isinstance(t, torch.Tensor) and t.dtype in cls._float_types
            }

            args, kwargs = pytree.tree_unflatten(flat_args_kwargs, args_spec)
            out = pytree.tree_unflatten(flat_outs, out_spec)

            compute_time = get_compute_time(func_packet, args, kwargs, out, out_dtypes)
            # The op is either memory- or compute-bound; take the max of the
            # two and divide by 1e6 to convert nanoseconds to milliseconds.
            op_time = max(transfer_time, compute_time) / 1e6

        return (out, op_time)

    def display_modulewise_stats(self, depth: int = 2) -> None:
        """
        Displays module-wise statistics collected by ``RuntimeEstimator``.

        Prints the pre-forward and pre-backward execution orders.
        Displays the module-wise forward and backward runtimes in milliseconds.

        Args:
            depth (int): The maximum depth of module hierarchy to display (default to 2).
        """
        print("Pre-Forward Execution Order: ")
        for mod_fqn in self.mod_fw_pre_order:
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(mod_fqn)
        print("Pre-Backward Execution Order: ")
        for mod_fqn in self.mod_bw_pre_order:
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(mod_fqn)
        for mod_fqn, runtimes in self.mod_runtimes.items():
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(
                f"{mod_fqn} fw: {runtimes.get('fw', 0.0):.3f}ms bw: {runtimes.get('bw', 0.0):.3f}ms"
            )

    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
        # Estimate the op's runtime and attribute it to every module currently
        # on the tracker's parent stack, split into forward/backward buckets.
        res, op_time = self._estimate(func, args, kwargs)
        for par in self._mod_tracker.parents:
            if self._mod_tracker.is_bw:
                self.mod_runtimes[par]["bw"] += op_time
            else:
                self.mod_runtimes[par]["fw"] += op_time
        self.total_runtime += op_time
        return res

    def __call__(self, estimate_mode_type: str) -> Self:
        """
        Sets the estimate mode type.

        Currently supported modes:
            - "operator-level-benchmark": Estimates runtime using operator benchmarking.
            - "operator-level-cost-model": Estimates runtime using roofline cost model.

        Args:
            estimate_mode_type (str): The type of estimate mode to use.

        Returns:
            RuntimeEstimator: The runtime estimator instance.

        Raises:
            NotImplementedError: If the estimate mode type is not supported.
        """
        if estimate_mode_type == "operator-level-benchmark":
            self._estimate = RuntimeEstimator._benchmark_estimate
        elif estimate_mode_type == "operator-level-cost-model":
            self._estimate = RuntimeEstimator._roofline_estimate
        else:
            raise NotImplementedError(
                f"estimate_mode_type {estimate_mode_type} not supported"
            )
        self._estimate_mode_type = estimate_mode_type
        return self

    def __enter__(self) -> Self:
        fake_mode = active_fake_mode()
        assert isinstance(
            fake_mode, FakeTensorMode
        ), "No FakeTensorMode found, designed to be used under FakeTensorMode"
        RuntimeEstimator.fake_mode = fake_mode
        self.total_runtime = 0.0
        self.mod_runtimes = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.mod_fw_pre_order.clear()
        self.mod_bw_pre_order.clear()
        self.mod_fw_post_order.clear()
        self.mod_bw_post_order.clear()
        self._mod_tracker.register_user_hooks(
            pre_fw_hook=lambda mod, inp: self.mod_fw_pre_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            pre_bw_hook=lambda mod, g_out: self.mod_bw_pre_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            post_fw_hook=lambda mod, inp, out: self.mod_fw_post_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            post_bw_hook=lambda mod, g_inp: self.mod_bw_post_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
        )
        self._mod_tracker.__enter__()
        super().__enter__()
        return self

    def __exit__(self, *args: Any) -> None:
        print(
            f"Estimated ({self._estimate_mode_type}) "
            f"total_time: {self.total_runtime:.3f} ms"
        )
        if len(self._no_fallback_kernel) > 0:
            print("no_fallback_kernel: ", list(self._no_fallback_kernel))
        super().__exit__(*args)
        self._mod_tracker.clear_user_hooks()