
    Vhu                     
   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlZd dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% g dZ&ejN                  jP                  Z(e(jR                  jT                  ejN                  jV                  jX                  jZ                  e(j\                  jT                  hZ/e%e/z  Z0 e1ejd                  jg                  dd             d k(  rdndZ4de5e6   de5e5e	      ddfdZ7e G d d             Z8e G d d             Z9e G d d             Z: G d de
      Z;e G d  d!             Z<e G d" d#             Z= G d$ d%e!      Z>y)&    N)OrderedDict)astuple	dataclass)Any
NamedTupleOptional)Self)nannnUntypedStorage)active_fake_mode)FakeTensorMode)get_untyped_storages)
ModTracker)RuntimeEstimator)
is_inplaceis_inplace_view_fn
is_view_fn)TorchDispatchMode)tree_flatten)SAC_IGNORED_OPS)SACEstimatorSACStatsMSPSSACTradeOffStatsSACGreedyOrderMetaPYTORCH_NO_CUDA_MEMORY_CACHINGi      headers
table_datareturnc                 t    	 ddl m } t         ||| d             y # t        $ r}t        d      |d }~ww xY w)Nr   )tabulatezPlease install tabulate.rst)r   tablefmt)r#   ImportErrorprint)r   r    r#   errs       V/home/dcms/DCMS/lib/python3.12/site-packages/torch/distributed/_tools/sac_estimator.py_display_stats_tabularr*   *   s=    ?%
 
(:w
?@	  ?453>?s    	727c                   v    e Zd ZU dZeed<   eed<   eed<   eed<   eedf   ed<   eedf   ed<   e	ed	<   e	ed
<   y)_SACMetadataaZ  
    Stores metadata for a single operator for SAC.

    Attributes:
        func (Any): The operator function.
        time_taken (float): The time taken by the operator.
        memory_used (float): The memory used by the operator.
        curr_idx (int): The current operator index.
        output_ids (Tuple[int, ...]): The storage IDs of the operator's outputs.
        inplace_info (Tuple[int, ...]): Tuple of self and parent operator for in-place operator.
        is_view_like (bool): Whether the operator is view-like.
        is_rand_op (bool): Whether the operator is a random operator.
    func
time_takenmemory_usedcurr_idx.
output_idsinplace_infois_view_like
is_rand_opN)
__name__
__module____qualname____doc__r   __annotations__floatinttuplebool     r)   r,   r,   6   sG     IMc3hS/!r?   r,   c                   6    e Zd ZU dZeed<   eed<   ee   ed<   y)_SACModMetadataa>  
    Stores metadata for a module for SAC.

    Attributes:
        start_idx (int): The starting index of the module's operators.
        force_store_random (bool): Whether to force store random operators in the module.
        sac_metadata (List[_SACMetadata]): List of metadata for each operator in the module.
    	start_idxforce_store_randomsac_metadataN)	r5   r6   r7   r8   r;   r9   r=   listr,   r>   r?   r)   rA   rA   P   s     N|$$r?   rA   c                       e Zd ZU dZee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   ee	eef      ed<   e
ed	<   y
)r   a  
    A class for storing Activation Checkpointing statistics corresponding to a module.

    Attributes:
        func_names (List[str]): List of operator names.
        runtimes (List[float]): List of operator runtimes in millliseconds.
        memory (List[int]): List of operator memory usage in bytes.
        view_like_ops (List[int]): Indices of view-like operators.
        rand_ops (List[int]): Indices of random operators.
        saved_autograd_ops (List[int]): Indices of operator results saved by autograd engine.
        inplace_ops (List[Tuple[int, int]]): Tuple of indices of op and its first parent for Inplace operators.
        force_store_random (bool): Whether to force store random operator results.
    
func_namesruntimesmemoryview_like_opsrand_opssaved_autograd_opsinplace_opsrC   N)r5   r6   r7   r8   rE   strr9   r:   r;   r<   r=   r>   r?   r)   r   r   `   s^     S	5kI93iS	!eCHo&&r?   r   c                   J    e Zd ZU dZee   ed<   eed<   eed<   eed<   eed<   y)r   a  
    Represents Memory and Runtime Statistics for an operator/operator group.

    Attributes:
        func_names (set[str]): Set of operator/operator group names.
        op_idx (int): Operator index (group head index incase of operator groups).
        memory (int): Memory usage in bytes.
        runtime (float): Runtime in milliseconds.
        msps (float): Memory per second calculated as memory/runtime.
    rG   op_idxrI   runtimemspsN)	r5   r6   r7   r8   setrN   r9   r;   r:   r>   r?   r)   r   r   z   s'    	 CKKN
Kr?   r   c                   t    e Zd ZU dZeed<   ee   ed<   ee   ed<   ee   ed<   eeef   ed<   eed<   eed<   y	)
r   aJ  
    Stores statistics for activation-checkpointing trade-off.

    Attributes:
        n_segments (int): Number of piecewise linear segments fitted to the trade-off curve.
        slopes (List[float]): Slopes of the pieces of linear segments fitted to the trade-off curve.
        intercepts (List[float]): Intercepts of the of the pieces of linear segments fitted to the trade-off curve.
        fit_breaks (List[float]): Breakpoints of the of the pieces of linear segments fitted to the trade-off curve.
        tradeoff_curve (OrderedDict[float, float]): Trade-off curve data of memory discarded vs recomputation time.
        sac_memory (int): Total memory of operations available for activation checkpointing in bytes.
        sac_runtime (float): Total runtime of operations available for activation checkpointing in milliseconds.
    
n_segmentsslopes
intercepts
fit_breakstradeoff_curve
sac_memorysac_runtimeN)	r5   r6   r7   r8   r;   r9   rE   r:   r   r>   r?   r)   r   r      sF     OKUUu--Or?   r   c                   v    e Zd ZU dZee   ed<   ee   ed<   eeee   f   ed<   eeee   f   ed<   ee	   ed<   y)r   a  
    Stores metadata for Greedy-order SAC.

    Attributes:
        recomputed_ops (set[int]): Set of operator indices to be recomputed.
        stored_ops (set[int]): Set of operator indices to be stored.
        inplace_op_groups (dict[int, set[int]]): Dictionary of inplace operator groups from group-head to operators.
        random_ops_group (dict[int, set[int]]): Dictionary of random op group head to random ops.
        msps_meta (list[MSPS]): List of Memory and Runtime Statistics for operators.
    recomputed_ops
stored_opsinplace_op_groupsrandom_ops_group	msps_metaN)
r5   r6   r7   r8   rS   r;   r9   dictrE   r   r>   r?   r)   r   r      sM    	 HCCSM**3C=))Dzr?   r   c                       e Zd ZdZd(dZdej                  dej                  fdZdej                  de
ddfd	Zdej                  de
d
e
ddfdZde
defdZdee   dedefdZde
dee   deeeedf   eeeedf   f   f   fdZ	 d)dZdedefdZ	 	 	 d*dedededededefdZ	 d+dededdfdZ	 d+dedededdfdZ 	 	 d,dededdfd Z!	 d,d!ededdfd"Z"d#ede#fd$Z$de#f fd%Z%d&e
ddf fd'Z& xZ'S )-r   a  
    Estimates the memory and recomputation time trade-offs for applying Selective Activation Checkpointing (SAC).

    This class provides a ``TorchDispatchMode`` based context manager that can be used to estimate the memory and
    runtime trade-offs of functions or ``torch.nn.Module``s for Selective Activation Checkpointing (SAC). It provides
    detailed statistics and metadata information for operators of each module and provides a greedy order for selecting
    the operators to be recomputed/checkpointed.  It also constructs the per-module trade-off graph of discarded memory
    vs recomputation time for the obtained greedy order. Using ``RuntimeEstimator`` under the hood, it supports two
    estimation modes, `operator-level-benchmark` and (`operator-level-cost-model` (roofline model).

    Attributes:
        sac_mod_stats (Dict[str, SACStats]): Dictionary from module FQN (fuly qualified name) to ``SACStats``.
        sac_mod_tradeoff_stats (Dict[str, SACTradeOffStats]): Dictionary from module FQN to ``SACTradeOffStats``.
        sac_mod_greedy_order_meta (Dict[str, SACGreedyOrderMeta]): Dictionary from module FQN to ``SACGreedyOrderMeta``.

    Note:
        1) This class is designed to be used under ``FakeTensorMode``.
        2) Currently, it only supports estimation of compute time and memory usage, and does not consider communication.

    Example usage:

        .. code-block:: python

            sac_estimator = SACEstimator()
            with FakeTensorMode():
                module = ...
                inp = ...
                with sac_estimator("operator-level-cost-model"):
                    output = module(inp)
                sac_estimator.display_modulewise_sac_stats(depth=4, print_tabular=True)
    r!   Nc                 B   i | _         i | _        i | _        t               | _        g | _        i | _        t               | _        t        j                  j                  j                  | j                  d       | _        t               | _        t         j"                  | _        y )Nc                     | S Nr>   xs    r)   <lambda>z'SACEstimator.__init__.<locals>.<lambda>   s    q r?   )sac_mod_statssac_mod_tradeoff_statssac_mod_greedy_order_metar   _mod_tracker_sac_metadata_sac_mod_metadatarS   _leaf_modulestorchautogradgraphsaved_tensors_hooks
_pack_hook_saved_tensor_hook_ctx_saved_tensor_idsr   _roofline_estimate_estimate_runtime)selfs    r)   __init__zSACEstimator.__init__   s~    24CE#HJ&&L13=?'*u&+nn&:&:&N&NOO['
# ,/5!1!D!Dr?   rh   c                 d    t        |      }d |D        }| j                  j                  |       |S )Nc              3   2   K   | ]  }t        |        y wrf   hash.0sts     r)   	<genexpr>z*SACEstimator._pack_hook.<locals>.<genexpr>   s     ;BtBx;   )r   rw   update)rz   rh   untyped_storagesstorage_idss       r)   ru   zSACEstimator._pack_hook   s3     02;*:;%%k2r?   modinputsc                 B   | j                   j                  |      }|J t        d |j                         D              }|dkD  r@| j	                  |      }t        t        | j                        |g       | j                  |<   y | j                  j                  |       y )Nc              3       K   | ]  }d   yw)r   Nr>   )r   _s     r)   r   z,SACEstimator._pre_fw_hook.<locals>.<genexpr>   s     515s   r   )rB   rC   rD   )rm   get_known_fqnsumchildren_get_force_store_randomrA   lenrn   ro   rp   add)rz   r   r   mod_fqnnum_childrenrC   s         r)   _pre_fw_hookzSACEstimator._pre_fw_hook   s     ##11#6"""5clln55!!%!=!=f!E.=d001#5/D""7+ ""7+r?   outputsc                 R   | j                   j                  |      }|J || j                  v ry | j                  | j                  |   j
                  | j                  |   j                        | j                  |<   | j                  | j                  |         | j                  |<   y )N)datarC   )
rm   r   rp   _get_sac_statsro   rD   rC   rj   _get_greedy_order_metarl   )rz   r   r   r   r   s        r)   _post_fw_hookzSACEstimator._post_fw_hook  s    
 ##11#6"""d(((*.*=*=++G4AA#'#9#9'#B#U#U +> +Dw' 7;6Q6Q""7+7D**73r?   c                 B    t        |      \  }}t        d |D              S )Nc              3   R   K   | ]  }t        |t        j                          ! y wrf   )
isinstancerq   Tensor)r   rh   s     r)   r   z7SACEstimator._get_force_store_random.<locals>.<genexpr>  s     Hqz!U\\22Hs   %')r   all)rz   r   flat_inputsr   s       r)   r   z$SACEstimator._get_force_store_random  s!    %f-QHKHHHr?   r   rC   c           
         |D cg c]  }|j                   t        vs| }}t        |D cg c]  }t        |       c}ddi\  }}}}}	}
}}t	        |      }t	        |      }|D cg c]  }|j
                  j                   }}t        |      D cg c]
  \  }}|s	| }}}t        |      D cg c]
  \  }}|s	| }}}t        |	      D cg c]+  \  }}t        |      j                  | j                        r|- }}}	 |
D cg c]$  }|st        t        |j                  |            & }}t        |      dz
  }t        |      t        |D ch c]  }|d   	 c}      z  }t!        |d      }|D ]  }||k(  s	|dz  } d||<   t#        ||||||||      S c c}w c c}w c c}w c c}}w c c}}w c c}}w c c}w # t        $ r}t        dt               |d }~ww xY wc c}w )NstrictTzbThe remapping of inplace ops failed since one of the inplace op parents must have been present in r   r   )reverse)rG   rH   rI   rJ   rK   rL   rM   rC   )r-   OPS_TO_ALWAYS_SKIPzipr   rE   _overloadpacketr5   	enumeraterS   issubsetrw   r<   mapindex
ValueErrorr   sortedr   )rz   r   rC   rh   filtered_dataops	runtimes_memory_new_idsr1   inplace_ops_view_like_ops_	rand_ops_rH   rI   oprG   irJ   rK   out_idsrL   rM   r(   last_op	skip_ops_reversed_skip_opss                              r)   r   zSACEstimator._get_sac_stats  s$   
 %)MqAFF:L,LMM m4'!*4BTB		
 	?g<?@bb((11@
@'0'@Ftq!AFF"+I"6<$!Q!A<< (
3
77|$$T%;%;< 
 
	AMSAQR5W]]A!67SKS c(Q,&K-Hqad-H)II	"9d;# 	BW}1	 w !'1#1	
 		
i N 5
 AF<
 T 	..@-AC 	 .Isi   F%F%F*(F/
F4 F45
F: F:0G 	G G!G6G G/
G 	G,G''G,r-   out_storages.c                    t        | j                        }| j                  j                  D ch c]  }|| j                  vs| }}t        d |D              }t        |      s|||D ci c]  }|d c}fS |}|D ci c]  }|d }	}t        | j                        D ]  \  }
}|j                  }t        |      j                  t        |            s6|	j                         D ]K  \  }}|dk(  s| j                  j                  |d       x}r|
|j                  k\  s:|
|	|<   @|dk(  sJ |
|	|<   M  |	j                         D ]  \  }}|dk  s||	|<    |D ci c]
  }|||	|   f }}|||fS c c}w c c}w c c}w c c}w )Nc              3   2   K   | ]  }t        |        y wrf   r~   r   s     r)   r   z5SACEstimator._get_inplace_metadata.<locals>.<genexpr>f  s     ;48;r   r>   Globalr   )r   rn   rm   parentsrp   r<   r   r   r1   rS   r   itemsro   getrB   )rz   r-   r   r0   paractive_mod_fqnsr1   r   rP   mod_op_parent_idxsr   dpast_output_idsop_parent_idx	acm_statsmod_inplace_infos                   r)   _get_inplace_metadataz"SACEstimator._get_inplace_metadata\  s    t))*  ,,44%
4CUCU8UC%
 %
 ;l;;
$Z_)U''2+)UUU (7.
#GRK.
 .
 d001 	<DAq  llO:''O(<=.@.F.F.H <*G]$*(,(>(>(B(B7D(QQ9Q I$7$77>? 27 ;#*h#66#6:;.w7<	< '9&>&>&@ 	5"G]q .4"7+	5
 +
 f09::
 
 %555I%
 *V.
*
s   E3E3+
E8?
E=Fc                    | j                  |||      \  }}t        |      \  }}t               }	t               }
t               }|D ]  }t        |t        j
                        s|j                  j                  dk(  r6|	j                  t        |             |j                  |j                         m|
j                  t        |              t        |      dk  sJ |j                   d|        t        d |	D              }t        d |
D              }||z   }|	|
z  }| j                  ||      \  }}}t        |      xs t!        |      }t        j"                  j$                  |j&                  v }|rd}|j(                  j                  dk(  r|j+                  dd      dk7  }| j,                  j.                  D ]  }|| j0                  v rt3        |||||||   ||	      }| j4                  j+                  |d       x}r|j6                  j9                  |       b|d
k(  sJ d| d       | j:                  j9                  |        |S )Ncudar   z''s output has more than 1 CUDA devices c              3   ~   K   | ]5  }t        j                  |j                         t        z        t        z   7 y wrf   )mathceilnbytes_PYTORCH_MIN_ALLOCATEr   s     r)   r   z2SACEstimator.__torch_dispatch__.<locals>.<genexpr>  s3      
 IIbiik$99:=RR
s   ;=c              3   <   K   | ]  }|j                           y wrf   )r   r   s     r)   r   z2SACEstimator.__torch_dispatch__.<locals>.<genexpr>  s     @@s   r   #_scaled_dot_product_flash_attention	dropout_p)r-   r.   r/   r0   r1   r2   r3   r4   r   zModule z not found in AC Mod Stats)ry   r   rS   r   rq   r   devicetyper   r   r   r   r5   r   r   r   r   Tagnondeterministic_seededtagsoverloadpacketr   rm   r   rp   r,   ro   rD   appendrn   )rz   r-   typesargskwargsoutop_time	flat_outsr   out_storages_cudaout_storages_cpucuda_devicesonbytes_cuda
nbytes_cpur   r   r0   r1   r   r3   r4   r   acmr   s                            r)   __torch_dispatch__zSACEstimator.__torch_dispatch__  s`    --dD&AW#C(	11403*-% 	EA!U\\*88==F*%,,-A!-DE $$QXX.$++,@,CD	E < A% 	
}}oD\NS	
%
  
'
 
 @/?@@
z)(+;;151K1K,2
.*. "$'C+=d+CYY66$))C
F ''+PPK3q8J((00 	/G$,,,""!%-g6)%	C !2266wEEyE&&--c2(* gY&@A* ""))#.'	/* 
r?   	sac_statsc           
          i }t        j                        }|j                         D ])  \  }}|j                  ||h      }|j	                  |       + i }t        j                  d      }t        j                        }	|	rt        j                        ||<   t               }
t               }|	rj                  r|
j	                  |       |j                  t        j                               |j                         D ]]  \  }}||v r|
j	                  |       j                  t        |t        j                        z        z  dkD  sM|
j	                  |       _ t               }|	r||
vr|j	                  |       |j                  t        |j                               |
z
         |j                  t        t        t        j                                    |z
  |
z
  t        |j                               z
  t        j                        z
         g }|D ]  }|h}||v r|j                  ||          |	r ||k(  r|j                  j                         t!        fd|D              }t!        fd|D              }|D ch c]  }j"                  |    }}|dkD  r||z  nt$        j&                  j(                  }|j+                  t-        |||||              |j/                  d d       t1        ||
|||      S c c}w )	Nr   )defaultr   c              3   <   K   | ]  }j                   |     y wrf   rI   r   rP   r   s     r)   r   z6SACEstimator._get_greedy_order_meta.<locals>.<genexpr>  s     H6i&&v.H   c              3   <   K   | ]  }j                   |     y wrf   rH   r   s     r)   r   z6SACEstimator._get_greedy_order_meta.<locals>.<genexpr>  s     N),,V4Nr   c                     | j                   S rf   rR   rg   s    r)   ri   z5SACEstimator._get_greedy_order_meta.<locals>.<lambda>  s
    QVV r?   T)keyr   )rb   rM   r   
setdefaultr   minrK   r=   rS   rC   r   rJ   r   keysrangerI   r   rG   sys
float_infomaxr   r   sortr   )rz   r   r_   inplace_op_to_group_headrP   group_head_idxop_groupr`   random_group_head_idxhas_rand_opsr^   r]   recompute_candidatesra   cand_idx
op_indicesmemrQ   rG   rR   s    `                  r)   r   z#SACEstimator._get_greedy_order_meta  s   
 2437	8M8M3N  '?&D&D&F 	!"FN(33N^DTUHLL 	! 13 #I$6$6 CI../69):L:L6M23  #u
#&5I88NN01c)"9"9:;(9(?(?(A 		/$NH!99~. ,,s8c)BTBT>U3U/VV ~.		/ *-1C $$%:;##C(9(>(>(@$AJ$NO##c)**+,- *//123 )$$%	&	
 !#	, 	MH"J,,!!"3H"=>,A A!!)"4"45HZHHCN:NNGEOP6)..v6PJP&-kC'Ms~~7I7IDT*hWdKL	M 	+T:!J(9;KY
 	
 Qs   Lgreedy_order_metarU   save_tradeoff_graphfilenamec           
         ! 	 dd l }dd l}|j                  |j                  |j
                  |j                  |j                  f\  }	}
}}}t               }|
D ]D  }|j                  |       ||v r|j                  ||          ||v s1|j                  ||          F t        fd|D              }t        fd|D              }t        j                        t        j                        d}t               }|z  ||z  |z   <   |D ].  }||j                  z  }||j                  z  }|z  ||z  |z   <   0 t               }|	D ]D  }|j                  |       ||v r|j                  ||          ||v s1|j                  ||          F |t        fd|D              z  }|t        fd|D              z  }|z  ||z  |z   <   t!        |j#                                t!        |j%                               ! d t'               dz
   }!dd  }|j)                  ||      }t+        t-        t'        |      d	z
  |      d      }|j/                  |
       d|j(                  dt         t0           dt         t0           dt2        dd f
 !fd}|r |||||       |j5                         j7                         }t9        |j:                  |j<                        r t9        |j>                  |j<                        sJ |j:                  j7                         }|j>                  j7                         }tA        |||||      S # t        $ r}t        d      |d }~ww xY w)Nr   z&Please install pwlf and numpy package.c              3   <   K   | ]  }j                   |     y wrf   r   r   s     r)   r   z<SACEstimator._get_sac_tradeoff_pwlf_stats.<locals>.<genexpr>>  s     RI,,V4Rr   c              3   <   K   | ]  }j                   |     y wrf   r   r   s     r)   r   z<SACEstimator._get_sac_tradeoff_pwlf_stats.<locals>.<genexpr>?  s     UFY//7Ur   g{Gz?c              3   <   K   | ]  }j                   |     y wrf   r   r   s     r)   r   z<SACEstimator._get_sac_tradeoff_pwlf_stats.<locals>.<genexpr>[  s     S&Y--f5Sr   c              3   <   K   | ]  }j                   |     y wrf   r   r   s     r)   r   z<SACEstimator._get_sac_tradeoff_pwlf_stats.<locals>.<genexpr>\  s     VVi008Vr   r      )rU   pwlf_rh   yr  r!   c                    	 dd l m} dd l}|j	                  t        |      t        |      d      }| j                  |      }|j                          |j                  ||dd       |j                  ||dd	       |j                  d
d       |j                  d       |j                  d       |j                          |j                  |        |j                  d
 dddd       d}	t        j                   j#                  |	      st        j$                  |	       |j'                  t        j                   j)                  |	| d             y # t        $ r}t        d      |d }~ww xY w)Nr   zDInstall matplotlib and numpy using pip: pip install matplotlib numpyi'  )numr   Shifted)label-	Predictedrh   OriginalzRecomp time / Total recomp timezMemory discarded / Total memoryzTotal Memory = z B Total Runtime = .4fz ms
   )fontsizetradeoff_graphsz.png)matplotlib.pyplotpyplotnumpyr&   linspacer   r   predictfigureplotylabelxlabellegendtitlesuptitleospathexistsmakedirssavefigjoin)r  rh   r  r  pltnpr(   xHatyHatfolder_namerZ   r[   x_y_s             r)   save_prediction_graphzHSACEstimator._get_sac_tradeoff_pwlf_stats.<locals>.save_prediction_graphl  sG   /" ;;s1vs1v5;9D==&D JJLHHQ3iH0HHT4KH8HHRS
H3JJ89JJ89JJLII
$LL!*-@S@QQTU   ,K77>>+.K(KK[XJd2CDE3  !Zs   
E 	E5$E00E5)rU   rV   rW   rX   rY   rZ   r[   )!r!  pwlfr&   r^   r]   r_   r`   ra   rS   r   r   r   rH   rI   r   rQ   rE   r   valuesr   PiecewiseLinFitr   r   fitr:   rN   calc_slopestolistr   rW   ndarrayrX   r   )"rz   r   r	  rU   r
  r  r2  r9  r(   r^   r]   r_   r`   ra   recomp_indicesr_idxdiscarded_memrecomp_runtimedeltarY   candstored_indicess_idxrh   r  tradeoff_pwlfr8  rV   rW   rX   rZ   r[   r6  r7  s"    `                            @@@@r)   _get_sac_tradeoff_pwlf_statsz)SACEstimator._get_sac_tradeoff_pwlf_stats   s   	Q
 ((,,//..''V
R
N$57G $'5# 	?Eu%))%%&7&>?((%%&6u&=>	? R>RRUnUU),,-))*
 $ [( 	
2e;<
  	DT[[(Mdll*N, MJ6%?@	 $'5 	?Eu%))%%&7&>?((%%&6u&=>	? 	SNSSS#V~VVV[( 	
2e;< .%%'(.'')* }R1qrF,,Q2SVaZ4a8
Z0	F''	F,0K	F<@K	FSV	F	F 	FB !-Ax@**,335-22BJJ?J$$bjjE
 	
 
 #--446
"--446
!!!)!#
 	
W  	QFGSP	Qs   M 	M!MM!print_tabularc                    t        dt        |j                         dt        |j                         d|j                          g }t        |j                        }t        |j                        D ]  \  }}t        |      ||j                  |   dt        |j                  |         t        ||j                  v       t        ||j                  v       t        ||j                  v       t        |j                  |d            g}|j                  |        g d}|rt        ||       yt!        t#        |            D 	cg c]  }	d }
}	|j%                  d|       |D ]0  }t        |      D ]   \  }}t'        |
|   t#        |            |
|<   " 2 |D ]A  }t        dj)                  t        |      D cg c]  \  }}|d	|
|    d
 c}}             C yc c}	w c c}}w )a  
        Displays the SAC statistics.

        Args:
            sac_stats (SACStats): The SAC statistics to display.
            print_tabular (bool, optional): Whether to print the statistics in a tabular format. Defaults to False.

        Prints:
            1. Total Memory: The total memory usage in bytes.
            2. Total Runtime: The total runtime in milliseconds.
            3. Store Random: A flag indicating whether to force store random operator results.

            Followed by a table with the following columns:
            1. Op Idx: The operator index.
            2. Op Name: The operator name.
            3. Runtimes (ms): The operator runtime in milliseconds.
            4. Memory (B): The operator memory usage in bytes.
            5. View-like: A flag indicating whether the operator is view-like.
            6. Random: A flag indicating whether the operator is random.
            7. Saved Autograd: A flag indicating whether the operator's result is saved by autograd engine.
            8. In-place: The index of the operator's first parent, or None if not in-place.

        If print_tabular is True, the table is printed in a tabular format.
        Otherwise, the table is printed in a plain text format.
        zTotal Memory: z B Total Runtime: z ms Store Random: r  N)zOp IdxzOp NamezRuntimes(ms)z
Memory (B)z	View-likeRandomzSaved AutogradzIn-placer   	< )r'   r   rI   rH   rC   rb   rM   r   rG   rN   rJ   rK   rL   r   r   r*   r   r   insertr   r0  )rz   r   rJ  r    	op_parentr   fn_namerowr   r   
max_widthselems               r)   display_sac_statszSACEstimator.display_sac_stats  s   8 	S!1!1233Ec)J\J\F]E^ _'::;=	
 
../	#I$8$89 	#JAwA%%a(-I$$Q'(A0001A+++,A5556IMM!T*+	C c"	#	
 "7J7%*3w<%89!9J9a)! B(~ BGAt$'
1s4y$AJqMBB " IICLS>R4D:a=/ 123R : Ss   	G8G!c                    g t        j                        t        j                        cdd	 	 	 ddt        t           dt        t
           dt        t           dt        t           dt        t           d	dffd
}|j                  |j                  |j                  |j                  |j                  f\  }}}}}	|D ]  }
|
h}|
|v r|j                  ||
          |
|v r|j                  ||
          t        fd|D              z  t        fd|D              z  |D ch c]  }j                  |    }} |||d        |	D ]  }|j                  z  |j                   z  |j"                  h}|j"                  |v r|j                  ||j"                            |j"                  |v r|j                  ||j"                             |||j                  |j$                          |D ]  }
|
h}|
|v r|j                  ||
          |
|v r|j                  ||
          t        fd|D              z  t        fd|D              z  |D ch c]  }j                  |    }} |||d        g d}|rt'        |       yt)        t+        |            D cg c]  }d }}j-                  d|       D ]0  }t/        |      D ]   \  }}t1        ||   t+        |            ||<   " 2 D ]A  }t3        dj5                  t/        |      D cg c]  \  }}|d||    d c}}             C yc c}w c c}w c c}w c c}}w )af  
        Displays the SAC trade-off statistics.

        Args:
            greedy_order_meta (SACGreedyOrderMeta): The SAC greedy order metadata.
            sac_stats (SACStats): The SAC statistics.
            print_tabular (bool, optional): Whether to print the statistics in a tabular format. Defaults to False.

        Prints:
            A table with the following columns:
            1. Op Id(s): The operator index(es).
            2. Op Name(s): The operator name(s).
            3. Discarded Mem (%): The percentage of discarded memory.
            4. Discarded Mem (B): The discarded memory in bytes.
            5. Recomp time (%): The percentage of recomputed time.
            6. Recomp time (ms): The recomputed time in milliseconds.
            7. MSPS: The memory per second.
            8. Always Stored: A flag indicating whether the operator is always stored.
            9. Always Recomputed: A flag indicating whether the operator is always recomputed.

        If print_tabular is True, the table is printed in a tabular format.
        Otherwise, the table is printed in a plain text format.
        r   g        Nr  rG   rR   stored
recomputedr!   c                     t        |       t        |      	z  dt              
z  dt              ||dnt        t              t        |      t        |      g	}j                  |       y )Nr  z.2e)rN   r
   r   )r  rG   rR   rX  rY  rS  rB  rC  r    total_memorytotal_runtimes         r)   
append_rowz;SACEstimator.display_sac_tradeoff_stats.<locals>.append_row
  st     JJ </4M"!M1#6N#!%!14*s3xFJ
C c"r?   c              3   <   K   | ]  }j                   |     y wrf   r   r   r   r   s     r)   r   z:SACEstimator.display_sac_tradeoff_stats.<locals>.<genexpr>,        I!1!1!!4 Ir   c              3   <   K   | ]  }j                   |     y wrf   r   r_  s     r)   r   z:SACEstimator.display_sac_tradeoff_stats.<locals>.<genexpr>-       !LA)"4"4Q"7!Lr   T)rY  r   c              3   <   K   | ]  }j                   |     y wrf   r   r_  s     r)   r   z:SACEstimator.display_sac_tradeoff_stats.<locals>.<genexpr>A  r`  r   c              3   <   K   | ]  }j                   |     y wrf   r   r_  s     r)   r   z:SACEstimator.display_sac_tradeoff_stats.<locals>.<genexpr>B  rb  r   )rX  )	zOp Id(s)z
Op Name(s)zDiscarded Mem (%)zDiscarded Mem (B)zRecomp time (%)zRecomp time (ms)r   zAlways StoredzAlways RecomputedrM  rN  rO  )NFF)r   rI   rH   rS   r;   rN   r   r:   r=   r^   r]   r_   r`   ra   r   rG   rQ   rP   rR   r*   r   r   rP  r   r   r'   r0  )rz   r	  r   rJ  r]  r^   r]   r_   r`   ra   rP   r  r   rG   rE  r   r   rT  rS  rU  rB  rC  r    r[  r\  s     `                 @@@@@r)   display_sac_tradeoff_statsz'SACEstimator.display_sac_tradeoff_stats  s   : 
&))*:*:&;SASAS=T#m #
 %)%*).	#C	#C	# 5/	# TN		#
 !	# 	# 	#* ((,,//..''V
R
N$57G % 		@F$*8J**!!"3F";<))!!"26":;S Ij IIIMc!L!LLLN;EFa)..q1FJFz:$?		@  	DDT[[(Mdll*N++J{{//!!"3DKK"@A{{..!!"24;;"?@z4??C	D ! 		<F J**!!"3F";<))!!"26":;S Ij IIIMc!L!LLLN;EFa)..q1FJFz:d;		<

 "7J7%*3w<%89!9J9a)! B(~ BGAt$'
1s4y$AJqMBB " IICLS>R4D:a=/ 123RW G* G" : Ss   2M*M8	M-M save_tradeoff_graphsc                     | j                   j                         D ]5  \  }}| j                  || j                  |   |||      | j                  |<   7 y)aK  
        Fits a piecewise linear function with the specified sumber of segments to the SAC trade-off curve of
        discarded memory vs recomputation time.

        Args:
            n_segments (int, optional): The number of segments to be used for fitting the piecewise linear function to
                the trade-off curve. Defaults to 2.
            save_tradeoff_graphs (bool, optional): Whether to save the trade-off graphs to file. Defaults to False.

        If save_tradeoff_graphs is True, the trade-off graphs are saved to file using the module FQN as the filename.
        )r   r	  rU   r
  r  N)rj   r   rI  rl   rk   )rz   rU   rf  r   r   s        r)   pwlf_sac_tradeoff_curvez$SACEstimator.pwlf_sac_tradeoff_curve`  sb      #'"4"4":":"< 	GY373T3T#"&"@"@"I%$8  4U 4D''0	r?   depthc                    | j                   j                         D ]n  \  }}|j                  d      dz   }||kD  r t        d|        | j	                  ||       t        d| d       | j                  | j                  |   ||       p y)a  
        Displays the SAC and trade-off statistics for each module.

        Args:
            depth (int, optional): The maximum depth of modules to display. Defaults to 2.
            print_tabular (bool, optional): Whether to print the statistics in a tabular format. Defaults to False.

        Prints:
            For each module with depth less than or equal to the specified depth:
            1. The SAC statistics for the module (using display_sac_stats).
            2. The SAC trade-off statistics for the module (using display_sac_tradeoff_stats).

        If print_tabular is True, the statistics are printed in a tabular format.
        Otherwise, the statistics are printed in a plain text format.
        .r   zModule: zAC Trade-off for Module: z MSPS = Memory/RuntimeN)rj   r   countr'   rV  re  rl   )rz   ri  rJ  r   r   	mod_depths         r)   display_modulewise_sac_statsz)SACEstimator.display_modulewise_sac_statsy  s    $ #'"4"4":":"< 		GYc*Q.I5 HWI&'""9m<-gY6LMN++..w7M		r?   estimate_mode_typec                     |dk(  rt         j                  | _        | S |dk(  rt         j                  | _        | S t	        d| d      )a   
        Sets the estimate mode type.

        Currently supported modes:
            - "operator-level-benchmark": Estimates runtime using operator benchmarking.
            - "operator-level-cost-model": Estimates runtime using roofline cost model.

        Args:
            estimate_mode_type (str): The type of estimate mode to use.

        Returns:
            SACEstimator: The SAC estimator instance.

        Raises:
            NotImplementedError: If the estimate mode type is not supported.
        zoperator-level-benchmarkzoperator-level-cost-modelzestimate_mode_type z not supported)r   _benchmark_estimatery   rx   NotImplementedError)rz   ro  s     r)   __call__zSACEstimator.__call__  s_    " !;;%5%I%ID"   #>>%5%H%HD"
  &%&8%9H r?   c                 B   t               }t        |t              sJ d       |t        _        | j
                  j                  | j                  | j                         | j
                  j                          | j                  j                          t        | %         S )Nz0SAC Estimator should be called in FakeTensorMode)pre_fw_hookpost_fw_hook)r   r   r   r   	fake_moderm   register_user_hooksr   r   	__enter__rv   super)rz   rw  	__class__s     r)   ry  zSACEstimator.__enter__  s    $&	)^4 	
>	
4 &/"--))++ 	. 	
 	##%##--/w ""r?   r   c                     | j                   j                           | j                  j                  |  t        |   |  y rf   )rv   __exit__rm   rz  )rz   r   r{  s     r)   r}  zSACEstimator.__exit__  s8    ##,,."""D)$r?   )r!   N).N)r  Fac_tradeoff)F)r  F)(r5   r6   r7   r8   r{   rq   r   ru   r   Moduler   r   r   r=   r   rE   r,   r   r   rS   r   r<   r;   rb   rN   r   r   r   r   r   rI  rV  re  rh  rn  r	   rs  ry  r}  __classcell__)r{  s   @r)   r   r      sH   @EELL U\\ ,		 ,3 ,4 ," C # $ $Ic Id IB
&B
<@B
	B
H*6*6'*>':*6	sE#s(OT#uS#X*>%??	@*6Z -1AFS
 S
=O S
r $)%~
~
 .~
 	~

 "~
 ~
 
~
B :?F!F26F	FX $	v-v v 	v
 
vt %* # 
	4 5:-1	:3 4 6#4 # c  d    r?   r   )?r   r+  r   collectionsr   dataclassesr   r   typingr   r   r   typing_extensionsr	   rq   r
   r   r   torch._guardsr   torch._subclasses.fake_tensorr   %torch.distributed._tools.common_utilsr   $torch.distributed._tools.mod_trackerr   *torch.distributed._tools.runtime_estimatorr   ,torch.testing._internal.composite_compliancer   r   r   torch.utils._python_dispatchr   torch.utils._pytreer   torch.utils.checkpointr   __all__r   aten
lift_freshr   profiler_record_function_exit_RecordFunctionclone_ADDITIONAL_IGNORED_OPSr   r;   environr   r   rE   rN   r*   r,   rA   r   r   r   r   r   r>   r?   r)   <module>r     s    	 
 # * , , "  ) ) * 8 F ; G 
 ; , 2 Yyy~~ 	OO	II,,<<JJ 
 %'>>  

?CDIDq 
ADI A4S	? At A   2 % % %   2: &   .   &I $ I r?   