
    Vh.$                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ d Zdej>                  j@                  fdZ!dej>                  j@                  dee"   fdZ#dej>                  j@                  dee"   fdZ$de%fdZ&de'ee"      fdZ(d Z) G d d      Z* ed e*              ddZ+y)a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Optional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendc           	      t   d }t        t              }d}t               }| j                  D ]  }|j                  dk(  rkt	         ||j
                        t        j                        r;|t         ||j
                        j                                  j                  |       |dz  }~|j                  dk(  st        |j                  d      s|j                  j                  }t        |j                        D ]  \  }}|t!        |j"                        k  r|j"                  |   }	n2|j$                  |j&                  vrG|j&                  |j$                     }	d}
|j(                  r|j(                  j*                  rd}
|
s||t         ||	j
                        j                                  z  }  |S )	Nc                     d| v r| d   S | d   S )Nvalfake_result )metas    Q/home/dcms/DCMS/lib/python3.12/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fkz%find_input_mutations.<locals>.meta_fk7   s    #tmtE{Dm1DD    r   placeholderr   call_function_schemaFT)r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr!   	enumerate	argumentslenargsnamekwargs
alias_infois_write)gr   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r   find_input_mutationsr>   6   sh   E FIUNWW 44= '!&&/5<<8~gaffo&D&D&FGHLLYWNITT_$188Y/XX%%F#F$4$45 3s166{? vvayHxxqxx/  xx1H>>~~.."& #f&wx}}'='L'L'NO' N: r   gmc                     i }| j                   j                  D ]W  }|j                  j                  dd       }t	        |t
        j                        s:|j                  |vsI|||j                  <   Y |S )Nr   )graphr#   r   getr%   r&   r'   device)r?   device_node_mappingr8   ts       r   get_device_node_mappingrF   ]   sd    =?XX^^ .FFJJud#a&188;N+N,-). r   	aot_modelreturnc                     t        | j                        t        t        |            z
  }|sy t	        | j                        }t        ||      S N)r>   rA   r"   ranger   r   )rG   	num_fixedmutation_indicesplaceholderss       r   3check_for_mutation_ignore_cuda_graph_managed_tensorrO   f   sD     ,IOO<s5CS?TT'	8L#L2BCCr   c                     t         j                  st        | |      x}r|S t        t	        |             x}r|S t        |       x}rt        d|j                   d      S y )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrO   r	   rF   r   r
   r0   )rG   rL   mut_skipskipnodes        r   check_for_skiprV   q   sz    ::Jy
 
8 
 O6	* t  4Y??t?*->tyyk+KLLr   c                 v    t        t        t        |                   }|j                  dk(  sJ |j                  S )Ncuda)nextiterrF   typeindex)r?   rC   s     r   get_device_indexr]      s3    $.r234F;;&   <<r   c                    t        |       }t        |j                        dk(  sJ |j                  d   D cg c]>  }t        |t        j
                  j                  j                        r|j                  nd @ c}S c c}w )Nr   r   )	r   r.   r/   r%   r&   fxrU   Nodestack_trace)r?   outputr;   s      r   get_stack_tracesrc      sl    _Fv{{q    ;;q> 'sEHHMM,>,>?T	I  s   AA=c                     ddl m t        d      t        d       dfd	}fd}t	        ||t        j                  |d      t        j                  j                  j                        } ||       S )	Nr   )cudagraphify_implTc                    t        | |      }t        t        
      t        |            }t        | |      x}r%t	        j
                  	       t        d|        |S j                  t        |               ||t        |      j                  ddt        |       t        | j                        t        | j                        	      }d|_        |S )Nzskipping cudagraphs due to Fdevice_indexis_backwardis_inferencestack_tracesrN   mutated_input_idxsT)r   r   r.   rV   r   disabler   r"   r]   rK   valuerc   r   rA   r>   _boxed_call)rG   
aot_inputsrj   interpfixedskip_msgoutboxed_device_indexre   do_cudagraphsdynamo_inputss          r   forward_cudagraphsz&cudagraphs.<locals>.forward_cudagraphs   s    9j1&s='93z?K%i7787m,/-hZ8 M/	:;%L+11))4-ioo>3IOOD

 
r   c                     t         |      }
s S t               }t         |      x}rTt        d|       t        j
                  j                  j                  j                  d      J  fd}d|_	        |S  	||t        |      t               ddt               t         j                        t         j                        	      }d|_	        |S )Nzskipping cudagraphs due to %sF)create_if_none_existsc                 4    j                           |       S rJ   )set_to_running_backward)r5   rG   managers    r   fnz3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s    //1 ((r   Trg   )r   r   rV   r   r&   	_inductorcudagraph_treesget_managerrn   ro   rK   r]   rc   r   rA   r>   )rG   rp   rq   rr   rs   r~   rt   r}   ru   re   rv   s   `      @r   backward_cudagraphsz'cudagraphs.<locals>.backward_cudagraphs   s    9j1y)%i7787//
 oo55AA"(( B G &&&) "BNI%L))4))4-ioo>3IOOD

 
r   )rj   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesre   r   r   r   	functoolspartialr&   _dynamor   %cudagraph_backend_keep_input_mutation)dynamo_modelrw   rx   r   aot_cudagraphsru   re   rv   s    `   @@@r   
cudagraphsr      so    AdOM)$/ 2$L "&'$,,-?dS',}}';';'a'a	N ,66r   c                   0    e Zd ZdZed        Zed        Zy)CudagraphsBackendr   c                      ddl m}   |         y )Nr   reset_cudagraph_trees)r   r   r   s    r   resetzCudagraphsBackend.reset   s    Ir   c                     t        | |      S rJ   )r   )modelr5   s     r   __call__zCudagraphsBackend.__call__   s    %((r   N)__name__
__module____qualname__compiler_namestaticmethodr   r   r   r   r   r   r      s-     M   
 ) )r   r   r   )r0   compiler_fnc                   	 t        |t        t        f      sJ r$|D cg c]  }t        j                  |       c}nt        |      t        j
                  j                          t        j
                  j                         }|j                  t        j
                  j                                t        j
                  j                  |      5   | |  ddd       |j                          t        j
                  j                         j                  |       t        j
                  j                          t        j
                  j                         t        j
                  j                  |      5   |  	ddd       t        	t        t        f      s	f		fd}|S c c}w # 1 sw Y   xY w# 1 sw Y   >xY w)zBThis isn't registered as a backend, but is used in some benchmarksN)streamc                      t              t        |       k(  sJ r%t        |       D ]  \  }}|j                  |        j                          rD cg c]  }|j	                          c}S S c c}w rJ   )r.   zipcopy_replayclone)	
new_inputsdstsrcxcopy_inputscopy_outputsrA   static_inputsstatic_outputss	       r   runzcudagraphs_inner.<locals>.run  sp    =!S_444z: S		#'56!AGGI66!! 7s   A4)r%   listtupler&   
zeros_likerX   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrA   )
r   r5   r   r   r   r   r   rA   r   r   s
     ``   @@@r   cudagraphs_innerr      sY   ftUm,,,6<=))!,=V 
JJZZ F
uzz0023			6	" v
	JJ++F3	JJ JJ  "E			%		/ /./ntUm4(*	" 	" JA > / /s   F1F6?G6F?G)TT),__doc__r   collectionsr   typingr   r&   torch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr   torch._inductor.cudagraph_utilsr   r	   r
   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   r>   r_   GraphModulerF   strrO   rV   intr]   r   rc   r   r   r   r   r   r   <module>r      s   .  #     6 6   < &$N 4 4 Dxx##Dc]Dehh22 (3- $C D#/ K7\) )  l0A0C D$r   