
    mVhO                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ej                  j!                  ej                  j#                  e            Zej                  j'                  ed      gZd Z e j,                         d	        Zd
 Z G d de      Zd Zd Z G d de      Z G d de      Zy)    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludec                     dd l }|j                         dk7  ry dd lddlm}m}m}mmm}  G fddj                        }j                  | ||       ||       ||            }	 j                  d      j                  }|g|_        ||_        dj!                  dz         }	 fd	}
 | ||
      |	      r$t#        j$                  j'                  |	            S y # t        $ r Y y w xY w)
Nr   Linux)c_charc_intc_size_tc_void_pc_char_pPOINTERc                   "    e Zd ZdW fdW  fgZy)8_find_already_mmapped_dylib_on_linux.<locals>.DlPhdrInfo	dlpi_addr	dlpi_nameN)__name__
__module____qualname___fields_)r   r   s   J/home/dcms/DCMS/lib/python3.12/site-packages/triton/backends/amd/driver.py
DlPhdrInfor      s    (#(#
    r   z	libc.so.6i      c           
          | j                   j                  }t        t        j                  |            }|j
                  v r'j                  ||t        t        |                   yy)Nr   r   )	contentsr   r   osfsdecodenamememmoveminlen)infosizedatar   pctypeslib_namemax_path_lengths        r   callbackz6_find_already_mmapped_dylib_on_linux.<locals>.callback2   sR    MM++	Y'(qvvNN4CY,PQr   )platformsystemr)   r   r   r   r   r   r   	Structure	CFUNCTYPECDLLdl_iterate_phdr	Exceptionargtypesrestypecreate_string_bufferr   r    	string_at)r*   r-   r   r   r   r   r   
callback_tr2   pathr,   r   r   r)   r+   s   `          @@@@r   $_find_already_mmapped_dylib_on_linuxr:      s    G#
 KK
V%% 
 !!%)<gh>OQXY_Q`aJ ++k2BB !+H5O#OO&&':;D z(+T2{{6++D122+  s   1C- -	C98C9c                     d} t        j                  d      }|rC|j                  |       r!t         j                  j	                  |      r|S t        d| d|        t        |       }|r2t         j                  j	                  |      r|S t        d| d|        g }dd l}|j                         }|j                         }|j                  r|g|z   }|D ]X  }t         j                  j                  |dd	|       }t         j                  j	                  |      r|c S |j                  |       Z t        j                  d
      }|rj|j                  d      D ]V  }	t         j                  j                  |	|       }
t         j                  j	                  |
      r|
c S |j                  |
       X t        j                  ddg      j!                         }|j#                         D cg c]5  }|j%                         j                  |       s#|j                         d   7 }}|D ]6  }t         j                  j	                  |      r|c S |j                  |       8 t         j                  j                  d|       }t         j                  j	                  |      r|S |j                  |       t        d|  d|       c c}w )Nzlibamdhip64.soTRITON_LIBHIP_PATHzTRITON_LIBHIP_PATH 'z' does not point to a valid zmemory mapped 'z'' in process does not point to a valid r   torchlibLD_LIBRARY_PATH:z/sbin/ldconfigz-pz/opt/rocm/lib/zcannot locate z after attempted paths )r   getenvendswithr9   existsRuntimeErrorr:   sitegetsitepackagesgetusersitepackagesENABLE_USER_SITEjoinappendsplit
subprocesscheck_outputdecode
splitlinesstrip)r*   env_libhip_pathmmapped_pathpathsrF   site_packages	user_siter9   env_ld_library_pathdflibslinelocsloccommon_install_paths                   r   _get_path_to_hip_runtime_dylibr_   @   sh   H ii 45O##H-"''..2Q""1/1BB^_g^hijj 8AL77>>,'_\N:abjaklmmE ((*M((*I"m3 ww||D'5(;77>>$KT	 ))$56$**3/ 	AQ)Aww~~a LLO		 ""$4d#;<CCED *.):^djjl>S>ST\>]DJJL^D^ 77>>#JS '',,'7B	ww~~)*""	LL$%
z1HP
QQ _s   )$KKc           	         t        j                  | j                  d            j                         }t	        |      }|j                  | d      }|t        j                         5 }t        j                  j                  |d      }t        |d      5 }|j                  |        d d d        t        |||g t        g       }t        |d      5 }|j                  |j!                         | dd      }d d d        d d d        dd l}	|	j$                  j'                  ||      }
|	j$                  j)                  |
      }|
j*                  j-                  |       |S # 1 sw Y   xY w# 1 sw Y   uxY w# 1 sw Y   yxY w)	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   get_filetempfileTemporaryDirectoryr   r9   rJ   openwriter   include_dirputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcr!   keycache
cache_pathtmpdirsrc_pathrY   so	importlibspecmods               r   compile_module_from_srcr      sK   
..G,
-
7
7
9Cc"E4&-J((* 	Lfww||FH5Hh$ hKDBb$ L1"YYqvvxD6dYK
L	L >>11$
CD
..
)
)$
/CKKC J L L	L 	Ls<   (-E5E'(E5&E)5E5E&	"E5)E2	.E55E>c                   $     e Zd Z fdZd Z xZS )HIPUtilsc                 d    t        | d      st        t        |   |       | _        | j                  S )Ninstance)hasattrsuperr   __new__r   )cls	__class__s    r   r   zHIPUtils.__new__   s*    sJ' 37<CL||r   c                    t               }t        t        j                  j	                  t
        d            j                         }|j                  d|d      }t        |d      }|j                  | _	        |j                  | _
        y )Nzdriver.cz/*py_libhip_search_path*/r   	hip_utils)r_   r   r   r9   rJ   dirname	read_textreplacer   load_binaryget_device_properties)selflibhip_pathrv   r   s       r   __init__zHIPUtils.__init__   sg    46277<<45??A kk5{AF%c;7??%(%>%>"r   )r   r   r   r   r   __classcell__r   s   @r   r   r      s    
	?r   r   c                 >    | d   dk(  ryddddddd	d
dddddddd|    S )Nr   *hipDeviceptr_tint32_tint8_tint16_tint64_tuint32_tuint8_tuint16_tuint64_tfloatdouble)i1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64 )tys    r   	ty_to_cppr      sQ    	!u|  	!
 
r   c                     fdfdfddj                  |j                         D cg c]
  } |       c}      }d|z   }dj                  t        |j                                     }t        t	        t
        |j                  d                  }t        |      D ci c]  \  }}||
 }}}t        |      dkD  r)ddj                  d	 |j                         D              z   nd}dj                  d
 |j                         D              }	g }
|j                         D ]=  \  }}|d   dk(  r|
j                  d| d       $|dk7  s*|
j                  d|        ? t               }t        t        t        |                  }|j                         D cg c]  \  }}|dk7  sd|  }}}|j                  d       d| dt        |	      dkD  rd|	z   nd ddj                  |       d| d| ddj                  |j                         D cg c]  \  }} |       d| d c}}       d| d| ddj                  |j                         D cg c]  \  }}|d   dk(  rd| d| d| d | d!	nd  c}}       d"t        |
      dkD  rddj                  |
      z   nd d#}|S c c}w c c}}w c c}}w c c}}w c c}}w )$Nc                 ^    t        | t              rdj                  t        |             S | S )N,)
isinstancetuplerJ   map)sig_serialize_signatures    r   r   z+make_launcher.<locals>._serialize_signature   s)    c5!88C 4c:;;
r   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ryt	        |       S )Nr   []r   r   z	PyObject*	constexprr   r   rJ   r   r   )r   val_extracted_types     r   r   z&make_launcher.<locals>._extracted_type   sS    b% ((334Cse1:a5C<+}r   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv rydd	d
dddddddddt	        |          S )N ()r   r   Or   rY   rX   lbhiLBHIK)r   r   longr   r   r   r   r   r   r   r   r   )r   r   	format_ofs     r   r   z make_launcher.<locals>.format_of   s    b% ''#i,-Cse1:a5C<+
 B- 	r   r   
piiiKKOOOOr   r   z, c              3   ,   K   | ]  \  }}d |   yw)z&_argNr   .0r   r   s      r   	<genexpr>z make_launcher.<locals>.<genexpr>   s      LB5 Ls   c              3   N   K   | ]  \  }}|d k7  st        |       d|   yw)r   z argN)r   r   s      r   r   z make_launcher.<locals>.<genexpr>   s,     h2VX\gVgYr]O4s3hs   %%r   ptr_infoz.dev_ptrr   _argz&argz&global_scratcha;  
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <Python.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <dlfcn.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {"a  "};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                     \
  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f,                     \
                  unsigned int gridDimX, unsigned int gridDimY,               \
                  unsigned int gridDimZ, unsigned int blockDimX,              \
                  unsigned int blockDimY, unsigned int blockDimZ,             \
                  unsigned int sharedMemBytes, hipStream_t stream,            \
                  void **kernelParams, void **extra)                          \
  FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, hipFunction_t f,          \
                  unsigned int gridDimX, unsigned int gridDimY,               \
                  unsigned int gridDimZ, unsigned int blockDimX,              \
                  unsigned int blockDimY, unsigned int blockDimZ,             \
                  unsigned int sharedMemBytes, hipStream_t stream,            \
                  void **kernelParams, void **extra)                          \
  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data,                         \
                  hipPointer_attribute attribute, hipDeviceptr_t ptr)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                             \
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                             \
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
};

static struct HIPSymbolTable hipSymbolTable;

bool initSymbolTable() {
  // Use the HIP runtime library loaded into the existing process if it exits.
  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
  if (lib) {
    // printf("[triton] chosen loaded libamdhip64.so in the process\n");
  }

  // Otherwise, go through the list of search paths to dlopen the first HIP
  // driver library.
  if (!lib) {
    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
    for (int i = 0; i < n; ++i) {
      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
      if (handle) {
        lib = handle;
        // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
      }
    }
  }
  if (!lib) {
    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
    return false;
  }

  // Resolve all symbols we are interested in.
  dlerror(); // Clear existing errors
  const char *error = NULL;
#define QUERY_EACH_FN(hipSymbolName, ...)                                     \
  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName);       \
  error = dlerror();                                                          \
  if (error) {                                                               \
    PyErr_SetString(PyExc_RuntimeError,                                       \
                    "cannot query " #hipSymbolName " from libamdhip64.so");   \
    dlclose(lib);                                                             \
    return false;                                                             \
  }

  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)

  return true;
}

static inline void gpuAssert(hipError_t code, const char *file, int line)
{
   if (code != HIP_SUCCESS)
   {
      const char* prefix = "Triton Error [HIP]: ";
       const char* str = hipSymbolTable.hipGetErrorString(code);
      char err[1024] = {0};
      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
      PyErr_SetString(PyExc_RuntimeError, err);
   }
}

#define HIP_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t functionzc) {
  // printf("_launch hip kernel\n");
  hipDeviceptr_t global_scratch = 0;
  void *params[] = { z };
  if (gridX*gridY*gridZ > 0 && launch_cooperative_grid) {
    HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, z*num_warps, 1, 1, shared_memory, stream, params, 0));
    return;
  }
  if (gridX*gridY*gridZ > 0) {
    HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, a  *num_warps, 1, 1, shared_memory, stream, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    hipDeviceptr_t dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == hipErrorInvalidValue) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
    Py_DECREF(ret);
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  return ptr_info;
}

static PyObject* launch(PyObject* self, PyObject* args) {
   // printf("launch\n");
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   z _argz; z
  if(!PyArg_ParseTuple(args, "a  ", &launch_cooperative_grid,
                                           &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook a=  )) {
    return NULL;
  }

  // extract kernel metadata
  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    return NULL;
  }
  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }


  // raise exception asap
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;z;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_functionan  );

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  if(PyErr_Occurred()) {
    return NULL;
  }
  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  if (!initSymbolTable()) {
    return NULL;
  }
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)rJ   valuesr   listfilterboolrL   	enumerater$   itemsrK   r_   range)	constants	signature	warp_sizer   args_formatformatr   s	args_list	arg_declsinternal_args_listr   paramsrv   r   r   r   s                 @@@r   make_launcherr      s   
, ''93C3C3EFR9R=FGKK'F193C3C3EFGIVD)//#"678I"+I"67$!QA7I7PST]P^abPbtyy L)//:K LLLhjI 		hARhhI" 22a5C<%%8&<=;%%QCj1	2
 12K %I'(F&/oo&7MUQ2;LQCjMFM
MM#$
- .9M Y:`r ux  yB  uC  FG  uG  ae  hq  aq  MO  `P P yy() *^^g]h iS T]R] <^x 88Y__=NOEAr#$E!B/OPQ R  &x (R S\Q\ ]( 99  R[  Ra  Ra  Rc  d  IN  IJ  LNoqrsotx{o{&qc);A3bCSTUSVVjk  BD  D  d  e  f fr [^  _q  [r  uv  [v  sw  z~  zC  zC  DV  zW  sW  |~  r *CkCX JC G 8 NT P0 ds#   J8*J=%K3KK	 ##K,c                       e Zd Zd Zd Zy)HIPLauncherc                    t        d      rj                  n	t               }fd}|j                         D ci c]  \  }} ||      | }}}j                  j                         D ci c]  \  }}||
 }}}t        |||j                        t        d      }|j                  | _        |j                  | _	        y c c}}w c c}}w )Nr   c                 t    t        | t              r&j                  j                  j	                  |       fS | S N)r   strfn	arg_namesindex)xrv   s    r   <lambda>z&HIPLauncher.__init__.<locals>.<lambda>  s-    Z3=OSVV--33A69 UV r   __triton_launcher)
r   r   dictr   r   r   r   r   launchlaunch_cooperative_grid)	r   rv   metadatar   arg_idxidxvaluer   r   s	    `       r   r   zHIPLauncher.__init__  s    %,S+%>CMMDF	V;D??;LMZS%WS\5(M	M25--2E2E2GHJCS%ZH	HIy(2D2DE%c+>?jj'/'G'G$ NHs   C/Cc                 >     | j                   | j                  g|  y r   )r   r   )r   argss     r   __call__zHIPLauncher.__call__  s    D00848r   N)r   r   r   r   r  r   r   r   r   r     s    H9r   r   c                   R     e Zd Z fdZd Zed        Zd Zd Zd Z	d Z
d Z xZS )		HIPDriverc                 V    t         |           t               | _        t        | _        y r   )r   r   r   utilsr   launcher_cls)r   r   s    r   r   zHIPDriver.__init__  s    Z
'r   c                 "    dd l }|j                  S )Nr   )r=   cudar   r=   s     r   get_device_interfacezHIPDriver.get_device_interface  s    zzr   c                  Z    	 dd l } | j                  j                  d uS # t        $ r Y yw xY w)Nr   F)r=   versionhipImportError)r=   s    r   	is_activezHIPDriver.is_active  s1    	==$$D00 		s    	**c                     | j                         }| j                  j                  |      }|d   }|d   }t        d|j	                  d      d   |      S )NarchwarpSizer  r@   r   )get_current_devicer  r   r   rL   )r   devicedevice_propertiesr  r   s        r   get_current_targetzHIPDriver.get_current_target
  sU    ((* JJ<<VD (%j1	

3 2I>>r   c                 J    dd l }|j                  d| j                               S )Nr   r  )r=   r  r  r  s     r   get_active_torch_devicez!HIPDriver.get_active_torch_device  s    ||FD$;$;$=>>r   c                     ddl m} |S )Nr   )do_bench)triton.testingr  )r   r  s     r   get_benchmarkerzHIPDriver.get_benchmarker  s
    +r   c                 b    dd l }d}|j                  t        |dz        |j                  d      S )Nr   i      r  )dtyper  )r=   emptyint)r   r=   
cache_sizes      r   get_empty_cache_for_benchmarkz'HIPDriver.get_empty_cache_for_benchmark  s.     '
{{3zQ/uyy{PPr   c                 $    |j                          y r   )zero_)r   rx   s     r   clear_cachezHIPDriver.clear_cache!  s    r   )r   r   r   r   r  staticmethodr  r  r  r  r&  r)  r   r   s   @r   r  r    s;    (
  ??
Qr   r  )	functoolsr   rd   rM   ri   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.backends.compilerr   triton.backends.driverr   r9   r   realpath__file__rJ   rm   r:   	lru_cacher_   r   objectr   r   r   r   r  r   r   r   <module>r5     s     	     ' 2 . ,
''//"''**84
5ww||GY/0-` ;R ;R|&?v ?(
,hV	9& 9 +	 +r   