
import math
import os
from collections import defaultdict
from typing import Any, Callable
from typing_extensions import Self

import torch
import torch.utils._pytree as pytree
from torch._guards import active_fake_mode
from torch._inductor.utils import get_device_tflops, get_gpu_dram_gbps
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed._tools.mod_tracker import ModTracker
from torch.utils._mode_utils import no_dispatch
from torch.utils._python_dispatch import TorchDispatchMode
from torch.utils.flop_counter import flop_registry


aten = torch.ops.aten

# Minimum CUDA caching-allocator block size (512 B); 1 when caching is disabled.
_PYTORCH_MIN_ALLOCATE = (
    2**9 if int(os.environ.get("PYTORCH_NO_CUDA_MEMORY_CACHING", 0)) == 0 else 1
)

# No fallback kernel is needed for view ops
_VIEW_OPS = {
    aten.lift_fresh,
    aten.t,
    aten.transpose,
    aten.view,
    aten.detach,
    aten._unsafe_view,
    aten.split,
    aten.adjoint,
    aten.as_strided,
    aten.diagonal,
    aten.expand,
    aten.expand_as,
    aten.movedim,
    aten.permute,
    aten.select,
    aten.squeeze,
    aten.mT,
    aten.mH,
    aten.real,
    aten.imag,
    aten.view_as,
    aten.unflatten,
    aten.unfold,
    aten.unbind,
    aten.unsqueeze,
    aten.vsplit,
    aten.hsplit,
    aten.split_with_sizes,
    aten.swapaxes,
    aten.swapdims,
    aten.chunk,
}
# Tensor-creation ops need no benchmarking either
_CREATE_OPS = {
    aten.randint,
    aten.randn,
    aten.rand,
    aten.randn_like,
    aten.rand_like,
    aten.randint_like,
    aten.arange,
    aten.ones_like,
    aten.zeros_like,
}

_IGNORE_OPS = _VIEW_OPS | _CREATE_OPS

__all__ = ["RuntimeEstimator"]


class RuntimeEstimator(TorchDispatchMode):
    """
    Estimates the GPU runtime in milliseconds using various estimation methods under the ``FakeTensorMode``.

    This class provides a ``TorchDispatchMode`` based context manager that can be used to estimate the eager
    runtime of PyTorch functions. It supports two estimation modes, benchmarking (`operator-level-benchmark`) and
    roofline cost modeling (`operator-level-cost-model`).
    For modules executed under this context manager, it aggregates the forward and backward operation runtimes
    and also records their execution orders.

    Attributes:
        mod_runtimes (Dict[str, Dict[str, float]]): A dictionary of module runtimes. The key to the outer dictionary
            is the fully qualified name (FQN) of the module. For each module the forward and backward runtimes of the
            operations are aggregated in the inner dictionary keyed by 'fw' and 'bw'.
        mod_fw_pre_order (List[str]): List of module FQNs in pre-forward execution order.
        mod_bw_pre_order (List[str]): List of module FQNs in pre-backward execution order.
        mod_fw_post_order (List[str]): List of module FQNs in post-forward execution order.
        mod_bw_post_order (List[str]): List of module FQNs in post-backward execution order.
        total_runtime (float): The total estimated runtime in milliseconds.

    Note:
        1) The benchmarking estimate mode will execute kernels on GPU and assumes that every operation can run in
            isolation without causing an OOM error. It is also designed to be used only under ``FakeTensorMode``.
        2) Currently wrapper tensor sub-classes such as ``DTensor`` won't produce correct estimates. We plan to support
            them in future PRs.
        3) We only estimate the compute time; if your code has communication, it will not be considered. Again, we will
            support this in future PRs.

    Example usage:

        .. code-block:: python

            runtime_estimator = RuntimeEstimator()
            with FakeTensorMode():
                module = ...
                optimizer = ...
                inp = ...
                with runtime_estimator(estimate_mode_type="operator-level-cost-model"):
                    loss = module(inp)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                runtime_estimator.display_modulewise_stats()
    """

    _float_types: set[torch.dtype] = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }
    _no_fallback_kernel: set[torch._ops._OpNamespace] = set()
    fake_mode: FakeTensorMode

    def __init__(self) -> None:
        super().__init__()
        self._estimate: Callable
        self._estimate_mode_type: str
        self._mod_tracker = ModTracker()
        self.mod_runtimes: dict[str, dict[str, float]] = defaultdict(
            lambda: defaultdict(lambda: 0.0)
        )
        self.mod_fw_pre_order: list[str] = []
        self.mod_bw_pre_order: list[str] = []
        self.mod_fw_post_order: list[str] = []
        self.mod_bw_post_order: list[str] = []
        self.total_runtime: float = 0.0

    # NB: runs the op on real tensors to time it, but returns fake tensors.
    @classmethod
    def _maybe_run_and_benchmark_fallback_kernel(
        cls, func, args, kwargs, orig_not_implemented_exception
    ):
        """
        Runs and benchmarks a fallback kernel for a given function.

        Args:
            func (Callable): The function to benchmark.
            args (Tuple): The arguments to pass to the function.
            kwargs (Dict[str, Any]): The keyword arguments to pass to the function.
            orig_not_implemented_exception (Exception): The original exception to raise if the fallback kernel
                is not implemented.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        # Avoid the fallback for operators that in-place modify metadata,
        # because the input fake tensors would not be updated accordingly.
        if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
            raise orig_not_implemented_exception

        inp_impls = {}
        flat_args, args_spec = pytree.tree_flatten((args, kwargs))
        # Do REAL compute here (not on the meta device), outside of the fake
        # mode's kernel-invocation manager.
        with no_dispatch():

            def to_real_tensor(e):
                if cls.fake_mode.is_our_fake(e):
                    if e.dtype in cls._float_types:
                        out = torch.rand_like(e, device=e.fake_device)
                    else:
                        out = torch.ones_like(e, device=e.fake_device)
                    if e.is_sparse:
                        out._coalesced_(e.is_coalesced())
                    inp_impls[id(out)] = e
                    return out
                return e

            flat_args = [to_real_tensor(a) for a in flat_args]
            args, kwargs = pytree.tree_unflatten(flat_args, args_spec)
            r = func(*args, **kwargs)
            warmup_iters, actual_iters = 2, 3
            for _ in range(warmup_iters):
                func(*args, **kwargs)
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record(torch.cuda.current_stream())
            for _ in range(actual_iters):
                func(*args, **kwargs)
            end_event.record(torch.cuda.current_stream())
            torch.cuda.synchronize()
            cuda_time = start_event.elapsed_time(end_event)
            mean_op_time = cuda_time / actual_iters

        storages = set()
        for e in flat_args:
            if isinstance(e, torch.Tensor):
                if not e.is_sparse:
                    storages.add(e._typed_storage()._cdata)

        # Proper aliasing/metadata relationships between outputs and inputs
        # cannot be set up because of the conversion to a real device, unless
        # an input impl can be reused.
        def map_out(e):
            if id(e) not in inp_impls and (
                isinstance(e, torch.Tensor)
                and not e.is_sparse
                and e._typed_storage()._cdata in storages
            ):
                raise orig_not_implemented_exception

            if isinstance(e, torch.Tensor):
                if id(e) in inp_impls:
                    return inp_impls[id(e)]
                return cls.fake_mode.fake_tensor_converter.from_real_tensor(
                    cls.fake_mode, e
                )
            return e

        return (pytree.tree_map(map_out, r), mean_op_time)

    @classmethod
    def _benchmark_estimate(cls, func, args, kwargs) -> tuple[Any, float]:
        """
        Estimates the runtime of a function using benchmarking.

        Args:
            func: The function to estimate.
            args: The arguments to pass to the function.
            kwargs: The keyword arguments to pass to the function.
            res: The result of the function.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        assert isinstance(
            cls.fake_mode, FakeTensorMode
        ), "Initialize/Assign FakeTensorMode before using this function"
        mean_op_time = 0.0
        if func._overloadpacket not in _VIEW_OPS:
            try:
                res, mean_op_time = cls._maybe_run_and_benchmark_fallback_kernel(
                    func,
                    args,
                    kwargs,
                    NotImplementedError,
                )
                return (res, mean_op_time)
            except NotImplementedError:
                cls._no_fallback_kernel.add(func._overloadpacket)
        res = func(*args, **kwargs or {})
        return (res, mean_op_time)

    @classmethod
    def _roofline_estimate(cls, func, args, kwargs) -> tuple[Any, float]:
        """
        Estimates the runtime of a function using a roofline cost model.

        Args:
            func: The function to estimate.
            args: The arguments to pass to the function.
            kwargs: The keyword arguments to pass to the function.
            out: The output of the function.

        Returns:
            Tuple[Any, float]: A tuple containing the result of the function and
                the mean operation time in milliseconds.
        """
        assert (
            torch.cuda.is_available()
        ), "Roofline estimation needs to access CUDA capabilities to make estimations"

        def get_num_bytes(t: torch.Tensor) -> int:
            """
            Calculates the memory consumption of a tensor.

            Args:
                t (torch.Tensor): The input tensor.

            Returns:
                int: The memory consumption of the tensor in bytes.
            """
            num_bytes = t.untyped_storage().nbytes()
            # Round up to the caching allocator's minimum block size.
            mem_consumed = (
                math.ceil(num_bytes / _PYTORCH_MIN_ALLOCATE) * _PYTORCH_MIN_ALLOCATE
            )
            return mem_consumed

        def get_compute_time(func_packet, args, kwargs, out, out_dtypes) -> float:
            """
            Estimates the compute time of an aten operator.

            Args:
                func_packet: The operator overload packet.
                args: The arguments to the operator.
                kwargs: The keyword arguments to the operator.
                out: The output of the operator.
                out_dtypes: The output data types.

            Returns:
                float: The estimated compute time in nanoseconds.
            """
            if func_packet in flop_registry:
                assert (
                    len(out_dtypes) == 1
                ), f"Only support single out dtype got {out_dtypes} for {func_packet}"
                dtype = out_dtypes.pop()
                # get_device_tflops reports peta-FLOPS/s, so multiply by 1e15
                # to obtain FLOPS/s.
                peak_gpu_flops = get_device_tflops(dtype) * 1e15
                # Assume roughly 75% of the theoretical peak is achievable.
                factor = 0.75
                peak_empirical_flops = factor * peak_gpu_flops
                flop_count_func = flop_registry[func_packet]
                # Divide by 2 to convert FLOPs to MACs (multiply-accumulate).
                flop_count = flop_count_func(*args, **kwargs, out_val=out) / 2
                # Multiply by 1e9 to express the time in nanoseconds.
                compute_time = (flop_count / peak_empirical_flops) * 1e9
                return compute_time
            return 0.0

        def get_transfer_time(flat_args_kwargs, flat_outs) -> float:
            """
            Estimates the memory transfer time of input and output tensors.

            Args:
                flat_args_kwargs (List[torch.Tensor]): The flat list of arguments and keyword arguments.
                flat_outs (List[torch.Tensor]): The flat list of outputs.

            Returns:
                float: The estimated memory transfer time in nanoseconds.
            """
            gpu_memory_bandwidth = get_gpu_dram_gbps()
            read_bytes = sum(
                get_num_bytes(t)
                for t in flat_args_kwargs
                if isinstance(t, torch.Tensor)
            )
            write_bytes = sum(
                get_num_bytes(t) for t in flat_outs if isinstance(t, torch.Tensor)
            )
            counted_bytes = read_bytes + write_bytes
            # Bandwidth is in GB/s, so bytes / (GB/s) yields nanoseconds.
            transfer_time = counted_bytes / gpu_memory_bandwidth
            return transfer_time

        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        op_time = 0.0
        func_packet = func._overloadpacket
        if func_packet not in _IGNORE_OPS:
            flat_args_kwargs, args_spec = pytree.tree_flatten((args, kwargs))
            flat_outs, out_spec = pytree.tree_flatten(out)
            transfer_time = get_transfer_time(flat_args_kwargs, flat_outs)

            out_dtypes = {
                t.dtype
                for t in flat_outs
                if isinstance(t, torch.Tensor) and t.dtype in cls._float_types
            }

            args, kwargs = pytree.tree_unflatten(flat_args_kwargs, args_spec)
            out = pytree.tree_unflatten(flat_outs, out_spec)

            compute_time = get_compute_time(func_packet, args, kwargs, out, out_dtypes)
            # The op is either memory- or compute-bound; take the max of the
            # two and divide by 1e6 to convert nanoseconds to milliseconds.
            op_time = max(transfer_time, compute_time) / 1e6

        return (out, op_time)

    def display_modulewise_stats(self, depth: int = 2) -> None:
        """
        Displays module-wise statistics collected by ``RuntimeEstimator``.

        Prints the pre-forward and pre-backward execution orders.
        Displays the module-wise forward and backward runtimes in milliseconds.

        Args:
            depth (int): The maximum depth of module hierarchy to display (default to 2).
        """
        print("Pre-Forward Execution Order: ")
        for mod_fqn in self.mod_fw_pre_order:
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(mod_fqn)
        print("Pre-Backward Execution Order: ")
        for mod_fqn in self.mod_bw_pre_order:
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(mod_fqn)
        for mod_fqn, runtimes in self.mod_runtimes.items():
            mod_depth = mod_fqn.count(".") + 1
            if mod_depth > depth:
                continue
            print(
                f"{mod_fqn} fw: {runtimes.get('fw', 0.0):.3f}ms bw: {runtimes.get('bw', 0.0):.3f}ms"
            )

    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
        # Estimate the op's runtime and attribute it to every module currently
        # on the tracker's parent stack, split into forward/backward buckets.
        res, op_time = self._estimate(func, args, kwargs)
        for par in self._mod_tracker.parents:
            if self._mod_tracker.is_bw:
                self.mod_runtimes[par]["bw"] += op_time
            else:
                self.mod_runtimes[par]["fw"] += op_time
        self.total_runtime += op_time
        return res

    def __call__(self, estimate_mode_type: str) -> Self:
        """
        Sets the estimate mode type.

        Currently supported modes:
            - "operator-level-benchmark": Estimates runtime using operator benchmarking.
            - "operator-level-cost-model": Estimates runtime using roofline cost model.

        Args:
            estimate_mode_type (str): The type of estimate mode to use.

        Returns:
            RuntimeEstimator: The runtime estimator instance.

        Raises:
            NotImplementedError: If the estimate mode type is not supported.
        """
        if estimate_mode_type == "operator-level-benchmark":
            self._estimate = RuntimeEstimator._benchmark_estimate
        elif estimate_mode_type == "operator-level-cost-model":
            self._estimate = RuntimeEstimator._roofline_estimate
        else:
            raise NotImplementedError(
                f"estimate_mode_type {estimate_mode_type} not supported"
            )
        self._estimate_mode_type = estimate_mode_type
        return self

    def __enter__(self) -> Self:
        fake_mode = active_fake_mode()
        assert isinstance(
            fake_mode, FakeTensorMode
        ), "No FakeTensorMode found, designed to be used under FakeTensorMode"
        RuntimeEstimator.fake_mode = fake_mode
        self.total_runtime = 0.0
        self.mod_runtimes = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.mod_fw_pre_order.clear()
        self.mod_bw_pre_order.clear()
        self.mod_fw_post_order.clear()
        self.mod_bw_post_order.clear()
        self._mod_tracker.register_user_hooks(
            pre_fw_hook=lambda mod, inp: self.mod_fw_pre_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            pre_bw_hook=lambda mod, g_out: self.mod_bw_pre_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            post_fw_hook=lambda mod, inp, out: self.mod_fw_post_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
            post_bw_hook=lambda mod, g_inp: self.mod_bw_post_order.append(
                self._mod_tracker.get_known_fqn(mod)
            ),
        )
        self._mod_tracker.__enter__()
        super().__enter__()
        return self

    def __exit__(self, *args: Any) -> None:
        print(
            f"Estimated ({self._estimate_mode_type}) "
            f"total_time: {self.total_runtime:.3f} ms"
        )
        if len(self._no_fallback_kernel) > 0:
            print("no_fallback_kernel: ", list(self._no_fallback_kernel))
        super().__exit__(*args)
        self._mod_tracker.clear_user_hooks()