
    Vh`                    &   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlmZ d dlmZ ddlmZmZ ddlmZ dd	lmZ erdd
lmZ ddlmZmZ  ej6                  e      Zej<                   G d d             Zej<                   G d d             Z ej<                   G d d             Z!	 	 	 	 	 	 ddZ"	 	 	 	 ddZ#	 	 	 	 	 	 ddZ$	 	 	 	 	 	 	 	 	 	 ddZ%	 	 	 	 	 	 	 	 ddZ&	 	 	 	 	 	 	 	 	 	 d dZ'd!dZ(d!dZ)e'e(e)gf	 	 	 	 	 	 	 	 	 	 	 	 	 d"dZ*y)#    )annotationsN)CallableTYPE_CHECKING	TypedDictUnion)signpost_event)
OrderedSet   )MultiOutputLayout
NoneLayout)get_dtype_size)V)Dep)BaseSchedulerNodeSchedulerBufferc                  Z    e Zd ZU dZded<   dZded<    ej                  e      Z	ded<   y)	MemoryPlanningInfoForBufferr   int
size_alloc	size_freedefault_factoryOrderedSet[BaseSchedulerNode]
succ_nodesN)
__name__
__module____qualname__r   __annotations__r   dataclassesfieldr	   r        F/home/dcms/DCMS/lib/python3.12/site-packages/torch/_inductor/memory.pyr   r      s3    JIs0A0A0A"1J- r"   r   c                      e Zd ZU dZded<   dZded<    ej                  e      Z	ded<    ej                  e      Z
ded	<    ej                  e      Zded
<   y)MemoryPlanningInfoForNoder   r   indexsizer   z7OrderedSet[Union[SchedulerBuffer, FreeableInputBuffer]]pred_buffersr   
pred_nodesr   N)r   r   r   r&   r   r'   r   r    r	   r(   r)   r   r!   r"   r#   r%   r%   "   sq    E3ND#M*5 I  1B0A0A"1J-  1B0A0A"1J- r"   r%   c                  X    e Zd ZU ded<    ej
                  e      Zded<   d	dZd
dZ	y)FreeableInputBufferstrnamer   r   
mpi_bufferc                    | j                   S N)r-   selfs    r#   get_namezFreeableInputBuffer.get_name8   s    yyr"   c                ,    t        | j                        S r0   )hashr-   r1   s    r#   __hash__zFreeableInputBuffer.__hash__;   s    DIIr"   N)returnr,   )r7   r   )
r   r   r   r   r   r    r   r.   r3   r6   r!   r"   r#   r+   r+   1   s.    
I.?k.?.?3/J+ r"   r+   c                   dd}t        j                  t              }t               }| D ]{  }|j                  j
                  D ]`  }|j                  |v s|j                  j                  d      r.||j                     j                  |        ||      ||j                  <   b } t               }|j                         D ]"  \  }}	t        |t        ||   |	            ||<   $ |S )z
    Create and keep track of all input buffers that can be freed during the program

    Returns:
        A dictionary containing all freeble input buffers, keyed by their names.
    c                l    d}	 | j                         s| j                         }|S # t        $ r Y |S w xY w)Nr   )has_unbacked_symbolsnumbytes_hintKeyError)depress     r#   _dep_size_hintz.get_freeable_input_buf.<locals>._dep_size_hintL   sH    	++-'') 
  	 
	s    & 	33)primals_arg)r   r   )r=   r   r7   r   )collectionsdefaultdictr	   dictread_writesreadsr-   
startswithadditemsr+   r   )
nodesgraph_inputsr?   dep_name_to_succ_nodesdep_name_to_sizenoder=   name_to_freeable_input_bufdep_namer   s
             r#   get_freeable_input_bufrQ   ?   s    
 	
+  (,v A##)) 	ACxx<'0C0C#1 'sxx044T:-;C-@ *	AA BF 6 < < > 
*/B'*840
"8,
 &%r"   c                    ddl m ddlm t	               	 d	 	 	 	 	 dfd| j                         D ]  }|j                         vs |        S )a  
    Compute the size of each scheduler buffer, including (1) memory allocated when
    it is created and (2) memory deallocated when it is freed.

    We specially handle the case of MultiOutputLayout.
    Consider the following case:
        buf0 = some_ops_with_multi_outputs(...)
        buf1 = buf0[0] # assume 10 bytes
        buf2 = buf0[1] # assume 20 bytes
    In such cases,
        buf0: at creation, 30 bytes allocated, when deleted, 0 bytes freed
        buf1: at creation, 0 bytes allocated, when deleted, 10 bytes freed
        buf2: at creation, 0 bytes allocated, when deleted, 20 bytes freed

    Returns:
        A dictionary mapping a scheduler buffer to a tuple of (size_alloc, size_free).
    r
   )MultiOutput)
OutputNodec                   t        | j                  j                  t              rd	| j	                         <   yt        | j                  j                  t
              rd}| j                  D ][  }t        |j                        r|j                  j                         D ]%  }t        |j                        s| |d      z  }' ] |rdn|df	| j	                         <   |S t        j                  j                  j                  | j                  j                         d      t        | j                  j                               z  }|rdn||f	| j	                         <   |S )N)r   r   r   T)fallback)
isinstancerN   layoutr   r3   r   usersget_outputsr   graphsizevars	size_hint	get_numelr   	get_dtype)
	sched_bufuser_of_MultiOutputLayoutr   userbufbuf_sizerS   rT   _compute_and_update_buf_sizesched_buf_to_sizes
         r#   re   zGcompute_size_for_scheduler_buffer.<locals>._compute_and_update_buf_size   sL    inn++Z86<i0023	--/@AJ! Ndii499002 NC!#((K8"&B3&MM
NN /J7i0023 ww''11((*Q 2 y~~779:;H /H7i0023 Or"   )F)r`   r   ra   boolr7   r   )irrS   	schedulerrT   rD   valuesr3   )name_to_bufr`   rS   rT   re   rf   s     @@@@r#   !compute_size_for_scheduler_bufferrl   r   sz    (  %48F GL"?C	 : !'') 4	 '88(3	4 r"   c                ,   t        |      }t        j                  t              }| D ]1  }|j                  D ]   }||j
                     j                  |       " 3 |j                         D ]'  }t        ||   d   ||   d   ||         ||   _	        ) y)z
    For each SchedulerBuffer, assign its size info and successor nodes.
    A buffer's successor nodes determines when a buffer can be freed.
    r   r
   )r   r   r   N)
rl   rB   rC   r	   unmet_dependenciesr-   rH   keysr   r.   )rJ   rk   rf   rL   rN   r=   buf_names          r#   1assign_memory_planning_info_for_scheduler_buffersrq      s     :+F
 	
+   7** 	7C"388,006	77  $$& 
+F(215'1!4-h7,
H(
r"   c                L   ddl m t        |       D ]  \  }}t        d |j	                         D              }t        t        t        f             }|j                  j                  D ]j  }|j                  |v r-||j                  v r|j                  ||j                            >|j                  |v sM|j                  ||j                            l t        fd|D              }	t        d |j	                         D              }
t        ||||	|
      |_         y)zL
    Assign to each scheduler node its predecessor and successor nodes.
    r
   )r   c              3  H   K   | ]  }|j                   j                    y wr0   )r.   r   ).0buffers     r#   	<genexpr>zBassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>   s     W&**55W    "c              3  \   K   | ]#  }t        |      r|j                             % y wr0   )rW   defining_op_name)rt   pred_bufferr   name_to_fused_nodes     r#   rv   zBassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>   s1       
;8 {;;=> 
s   ),c              3  V   K   | ]!  }|j                   j                  D ]  }|  # y wr0   )r.   r   )rt   ru   	succ_nodes      r#   rv   zBassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>   s9       
#..99 
   
 
s   '))r&   r'   r(   r)   r   N)ri   r   	enumeratesumrZ   r	   r   r+   rE   rF   r-   rn   rH   r%   mpi_node)rJ   r{   rk   rO   r&   rN   r   r(   r=   r)   r   r   s    `         @r#   /assign_memory_planning_info_for_scheduler_nodesr      s    + ' 
tWDDTDTDVWW
!%9L(L"MNP##)) 	GCxx;&3$2I2I+I  SXX!6777  !;CHH!EF		G
    
+ 
 


    
**, 
 


 2%!!
%
r"   c                   t         j                   G d d             }t               t        |       D ]
  \  }}||<    g }|j	                         D ]  \  }}||v rt        |       dz
  n't        fd|j                  j                  D              }	|j                   |||j                  j                  |j                  j                  d|	              t        |       D ]  \  }}|j                         D ]  }
|
j                         |v rt        |       dz
  n1t        |
j                  j                  D cg c]  }|   	 c}|      }	|j                   ||
|
j                  j                  |
j                  j                  ||	               t        t        |       dz         D cg c]  }d }}|D ]G  }||j                  xx   |j                  z  cc<   ||j                   dz   xx   |j                  z  cc<   I d}d}g }t        t        |       dz         D ]'  }|||   z  }|j                  |       t        ||      }) ||fS c c}w c c}w )a  
    Given a list of nodes in their execution order, estimate the peak memory, by
    keeping track of the liveliness of SchedulerBuffers and FreeableInputBuffers.

    Returns:
        int: peak memory
        List[int]: memory usage at each node (or each step).
    c                  @    e Zd ZU ded<   ded<   ded<   ded<   ded<   y)	(estimate_peak_memory.<locals>.BufferInfoz+Union[SchedulerBuffer, FreeableInputBuffer]ru   r   r   r   
start_stepend_stepNr   r   r   r   r!   r"   r#   
BufferInfor     s    ;;r"   r   r
   c              3  (   K   | ]	  }|     y wr0   r!   )rt   r}   node_to_steps     r#   rv   z'estimate_peak_memory.<locals>.<genexpr>  s      ,5Y's   r   )default)r   	dataclassrD   r~   rI   lenmaxr.   r   appendr   rZ   r3   r   ranger   r   )rJ   rO   graph_outputsr   steprN   buf_info_listrp   	input_bufr   r`   r}   _memorybuf_info
max_memory
cur_memorymemories_at_nodestr   s                      @r#   estimate_peak_memoryr      s       26L& "
d!T" ')M9??A 
) =( JN 9B9M9M9X9X  	 	$$..$$..	

&  & 
d))+ 	I %%'=8 E
Q *3)=)=)H)H% %Y/ !    ((33((22	6 s5zA~./Aa/F/ " <x""#x':'::#x  1$%););;%<
 JJ3u:>" 1fQi
  ,Z0
1
 )**E$ 0s   9I.	Ic                    G d dt               } G d dt               }t               t               }t               }| D ]D  }t        |j                  j
                        dd|<   |   d   dk(  s4|j                  |       F t        |j                               t        |j                               z   D ]=  }	dt        |	j                  j                        |	j                         |v rd	ndz   i||	<   ? t        d
 |j                         D              d}
|D ]D  }||v r|
||   j                  j                  z  }
$||v s)|
||   j                  j                  z  }
F t        |
      | D ]  }|j                  j                  D ]2  }	||	   d   d	k(  s|   dxx   |	j                  j                  z  cc<   4 |j!                         D ]2  }	||	   d   dk(  s|   dxx   |	j                  j                  z  cc<   4  g }d}|t        |       k  rV|rSt#        |fd      }|j%                  |       |j'                  |       |d	z  }|j                  j(                  z  t              |   d   z  |j                  j                  D ]<  }|   d   dkD  sJ |   dxx   d	z  cc<   |   d   dk(  s,|j                  |       > |j                  j                  D ]j  }	||	   d   dkD  sJ ||	   dxx   d	z  cc<   ||	   d   d	k(  s,|	j                  j                  D ]&  }|   dxx   |	j                  j                  z  cc<   ( l |t        |       k  r|rS|t        |       kD  rt+        d      |S )a  
    A bfs-based greedy topological order. LPMF stands for "Least Peak Memory First".

    The idea is from this paper:
    Buffer memory optimization for video codec application modeled in Simulink
    https://www.cs.york.ac.uk/rts/docs/DAC-1964-2006/PAPERS/2006/DAC06/PDFFILES/P0689.PDF

    The algorithm maintain the max memory so far.
    At every iteration, for each scheduleable node, it computes:
        - how much memory needs to be allocated for the output buffers of this node;
        - how much memory can be freed as a result of executing this node.
    This gives us two values for each node:
        (1) mem1: memory during the execution of the node;
        (2) mem2: memory after executing the node, after some input buffers are freed.
    The greedy approach select as follows:
        (i) if there are nodes whose mem1 values are below the max memory so far,
            then pick the node with the lowest mem2 value;
        (ii) otherwise, pick the one with the lowest mem1 value.
    c                  "    e Zd ZU ded<   ded<   y)'topological_sort_lpmf.<locals>.NodeInfor   indegreememory_to_freeNr   r!   r"   r#   NodeInfor   p  s    r"   r   c                      e Zd ZU ded<   y))topological_sort_lpmf.<locals>.BufferInfor   	outdegreeNr   r!   r"   r#   r   r   t  s    r"   r   r   )r   r   r   r   r
   c              3  H   K   | ]  }|j                   j                    y wr0   r.   r   )rt   r   s     r#   rv   z(topological_sort_lpmf.<locals>.<genexpr>  s$       	&&rw   r   c                    t        | j                  j                  z         | j                  j                  |    d   z
  | j                  j                  fS )Nr   )r   r   r'   r&   )rN   live_memoryr   	node_infos    r#   <lambda>z'topological_sort_lpmf.<locals>.<lambda>  sL    K$--"4"44jA""Yt_5E%FF## r"   keyz4Failed to schedule, while loop ran too long for lpmf)r   rD   r	   r   r   r)   rH   listrj   r.   r   r3   r   r   r   r(   rZ   minremover   r'   RuntimeError)rJ   rO   rk   r   r   r   r   nodes_to_schedulerN   rc   output_memoryrp   schedule	num_itersselected_noder}   r   r   r   s                   @@@r#   topological_sort_lpmfr   V  s   49 Y  486INRfH 8B| (DMM445
	$ T?:&!+!!$'( K&&()D1K1R1R1T,UU 
S^^667LLNm3q<

  3::< K M! W{"[2==GGGM337ALLVVVM	W
 [-0J  N==-- 	NC}[)Q.$ 01S^^5M5MM1	N ##% 	NC}[)Q.$ 01S^^5M5MM1	NN )+HI
c%j
 %6
 	  /&Q	 	}--222[1
y/0@AA '//:: 	1IY'
3a777i ,1,#J/14!%%i0		1 !))66 	WCC=-111SM+&!+&}[)Q.!$!:!: WIi()9:cnn>V>VV:W		W7 c%j
 %6D 3u:QRROr"   c           	       
  G d dt               }t               
t        j                   G d d             }d
fd}g }| D ]V  }t	        |j
                  j                        dd
|<   
|   d   d	k(  s4t        j                  | | ||      |             X g }d	}|t	        |       k  r|rt        j                  |      j                  }t	        |      
|   d
<   |j                  |       |dz  }|j
                  j                  D ]N  }	
|	   d   d	kD  sJ 
|	   dxx   dz  cc<   
|	   d   d	k(  s,t        j                  | | ||	      |	             P |t	        |       k  r|r|t	        |       kD  rt        d      |S )a  
    A BFS topological sort that selects nodes whose dependencies are executed the
    earliest. This follows a FIFO idea. Specifically, at every iteration, for each node
    that is schedulable, we gather the order in which its predecessor nodes are executed,
    and this sorted list of execution orders of predecessor nodes defines the priority.
    We select the node whose predecessors nodes are executed the earliest. The FIFO
    idea aims to reduce the liveness duration of buffers created.
    c                  "    e Zd ZU ded<   ded<   y)&topological_sort_bfs.<locals>.NodeInfor   r   orderNr   r!   r"   r#   r   r     s    
r"   r   c                  *    e Zd ZU ded<   ded<   ddZy).topological_sort_bfs.<locals>.NodeWithPriority	list[int]priorityr   rN   c                    | j                   |j                   k(  rA| j                  j                  j                  |j                  j                  j                  k  S | j                   |j                   k  S r0   )r   rN   r   r&   )r2   others     r#   __lt__z5topological_sort_bfs.<locals>.NodeWithPriority.__lt__  sP    }}.yy))//%**2E2E2K2KKK==5>>11r"   N)r   NodeWithPriorityr7   rg   )r   r   r   r   r   r!   r"   r#   r   r     s    	2r"   r   c                    |    d   dk(  sJ t        t        fd| j                  j                  D                    }|S )Nr   r   c              3  .   K   | ]  }|   d      yw)r   Nr!   )rt   	pred_noder   s     r#   rv   z?topological_sort_bfs.<locals>._node_priority.<locals>.<genexpr>  s       2;	)$W-s   )sortedr	   r   r)   )rN   exec_ordersr   s     r#   _node_priorityz,topological_sort_bfs.<locals>._node_priority  sK    z*a/// ?C}}?W?W 

 r"   )r   r   r   r   r   r
   z3Failed to schedule, while loop ran too long for bfs)rN   r   r7   r   )r   rD   r   r   r   r   r)   heapqheappushheappoprN   r   r   r   )rJ   r   r   r   r   rN   r   r   r   r}   r   s             @r#   topological_sort_bfsr     s   9  486I2 2 2 13 '*4==+C+C'DrR	$T?:&!+NN!#3N44H$#O )+HI
c%j
 %6&78==,/M	- )&Q	 '//:: 	IY'
3a777i ,1,#J/14%$^I%>	J		 c%j
 %6" 3u:PQQOr"   c                n   t               t               g t               dfd| D ]  }|j                         D ]  }||<   	  | D ]B  }|j                  j                  t        d |j                  j                  D              z   |<   D t        | fd      D ]
  } |        S )a  
    This is a DFS topological sort. The setup is similar to `topological_sort_schedule`
    in scheduler.py. The difference is the order nodes are visited in the outer loop.
    In `topological_sort_schedule`, nodes are visited in their original order.
    In this function, nodes are visited based on their priority -- for each node, we
    compute the total memory of all buffers it reads from or writes to, and we visit
    the nodes in ascending order of this priority.
    c                    | vrtj                  |        | j                  D cg c]  }|j                  v r|j                     ! }}t        |fd      D ]
  } |        j	                  |        y y c c}w )Nc                :    |    | j                   j                  fS r0   r   r&   nsize_with_readss    r#   r   z5topological_sort_dfs.<locals>.visit.<locals>.<lambda>2  s    /!*<ajj>N>N)O r"   r   )rH   rn   r-   r   r   )	r   r=   	dep_nodesrN   name_to_noderesultseenr   visits	       r#   r   z#topological_sort_dfs.<locals>.visit)  s    D=HHQK //88|+ SXX&I 
 O  d MM! s   $A;c              3  H   K   | ]  }|j                   j                    y wr0   r   )rt   pred_bufs     r#   rv   z'topological_sort_dfs.<locals>.<genexpr><  s!      9
.6H))9
rw   c                :    |    | j                   j                  fS r0   r   r   s    r#   r   z&topological_sort_dfs.<locals>.<lambda>?  s    _Q-?AQAQ,R r"   r   )r   r   r7   None)r	   rD   get_buffer_namesr   r'   r   r(   r   )rJ   rN   r-   r   r   r   r   r   s      @@@@@r#   topological_sort_dfsr     s     +5,D15L&(F48FO   &))+ 	&D!%L	&&  
 $ 2 2S 9
:>--:T:T9
 6
 !

 u"RS d Mr"   c           
        t         j                  dt        |              t        j                   G d d             }t        | |      }t        | |       t        | |||       g }t        | ||      \  }	}
|j                   || |	d             t         j                  d|	       |D ]  }	 |t        k(  r || |||      }n ||       }t        |      t        |       k(  sJ t        |||      \  }}
|j                   ||||j                               t         j                  d|j                  |        t        d	d
d|D ci c]  }|j                  |j                    c}i       t#        |d       }|j$                  S # t        $ r,}t         j                  d|j                  |       Y d}~d}~ww xY wc c}w )z
    Try a few heuristics based topological sort algorithms, and pick the one whose
    resulting topological order has the lowest peak memory estimation.
    z&Reordering for peak memory -- %d nodesc                  ,    e Zd ZU ded<   ded<   ded<   y)1reorder_for_peak_memory.<locals>.PeakMemoryResultlist[BaseSchedulerNode]r   r   peak_memoryr,   methodNr   r!   r"   r#   PeakMemoryResultr   X  s    &&r"   r   baselinezBaseline peak memory: %dz%s peak memory: %dzFailed to reorder for %s: %sNinductorr   orm)categoryr-   
parametersc                    | j                   S r0   )r   )xs    r#   r   z)reorder_for_peak_memory.<locals>.<lambda>  s
    amm r"   r   )	torch_loginfor   r   r   rQ   rq   r   r   r   r   r   	Exceptionerrorr   r   r   r   r   )rJ   rk   r{   rK   r   methodsr   rO   peak_memory_diff_methodsestimated_peak_memoryr   r   r   r   eelembest_results                    r#   reorder_for_peak_memoryr   E  s   " NN;SZH   BX|B 6e[I3!;0J
 8:  4)= 1 ## 5zB NN-/DE  P	P..5{M uu:U+++11=NK %++ V__E NN/+NP& >VWdDKK!1!11W
 .4KLK  	POO:FOOQOO	P Xs   *B
FF;	F8!F33F8)rJ   r   rK   OrderedSet[str]r7   dict[str, FreeableInputBuffer])rk   dict[str, SchedulerBuffer]r7   zdict[str, tuple[int, int]])rJ   r   rk   r   r7   r   )
rJ   r   r{   dict[str, BaseSchedulerNode]rk   r   rO   r   r7   r   )rJ   r   rO   r   r   r   r7   ztuple[int, list[int]])
rJ   r   rO   r   rk   r   r   r   r7   r   )rJ   r   r7   r   )rJ   r   rk   r   r{   r   rK   r   r   r   r   z,list[Callable[..., list[BaseSchedulerNode]]]r7   r   )+
__future__r   rB   r   r   loggingtypingr   r   r   r   torch._utils_internalr   torch.utils._ordered_setr	   rh   r   r   utilsr   virtualizedr   dependenciesr   ri   r   r   	getLoggerr   r   r   r   r%   r+   rQ   rl   rq   r   r   r   r   r   r   r!   r"   r#   <module>r     s   "     < < 0 / - !  != Gh'	       
 
 
0&"0&!0& $0&f<+<<~
"
+
 

<#
"#
4#
 ,#
 !?	#

 
#
L^+"^+ >^+ #^+ 	^+Bz"z >z ,z #	z
 zzEP'b 	=N"N+N 5N "	N
 #N :N Nr"   