Spaces:
Running
Running
R0CKSTAR
committed on
Commit
·
8ec75c3
1
Parent(s):
3142fa9
musa: enable building fat binaries, enable unified memory, and disable Flash Attention on QY1 (MTT S80) (llama/9526)
Browse files
* mtgpu: add mp_21 support
Signed-off-by: Xiaodong Ye <[email protected]>
* mtgpu: disable flash attention on qy1 (MTT S80); disable q3_k and mul_mat_batched_cublas
Signed-off-by: Xiaodong Ye <[email protected]>
* mtgpu: enable unified memory
Signed-off-by: Xiaodong Ye <[email protected]>
* mtgpu: map cublasOperation_t to mublasOperation_t (sync code to latest)
Signed-off-by: Xiaodong Ye <[email protected]>
---------
Signed-off-by: Xiaodong Ye <[email protected]>
ggml/src/CMakeLists.txt
CHANGED
|
@@ -364,7 +364,7 @@ if (GGML_CUDA)
|
|
| 364 |
if (GGML_MUSA)
|
| 365 |
set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
|
| 366 |
foreach(SOURCE ${GGML_SOURCES_CUDA})
|
| 367 |
-
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
|
| 368 |
endforeach()
|
| 369 |
endif()
|
| 370 |
|
|
|
|
| 364 |
if (GGML_MUSA)
|
| 365 |
set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
|
| 366 |
foreach(SOURCE ${GGML_SOURCES_CUDA})
|
| 367 |
+
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
|
| 368 |
endforeach()
|
| 369 |
endif()
|
| 370 |
|
ggml/src/ggml-cuda.cu
CHANGED
|
@@ -136,7 +136,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|
| 136 |
return res;
|
| 137 |
#else
|
| 138 |
|
| 139 |
-
#if !defined(GGML_USE_HIPBLAS)
|
| 140 |
cudaError_t err;
|
| 141 |
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
| 142 |
{
|
|
@@ -149,7 +149,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|
| 149 |
return err;
|
| 150 |
#else
|
| 151 |
return cudaMalloc(ptr, size);
|
| 152 |
-
#endif // !defined(GGML_USE_HIPBLAS)
|
| 153 |
|
| 154 |
#endif
|
| 155 |
}
|
|
@@ -2830,6 +2830,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 2830 |
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
|
| 2831 |
return false;
|
| 2832 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2833 |
switch (a->type) {
|
| 2834 |
case GGML_TYPE_F32:
|
| 2835 |
case GGML_TYPE_F16:
|
|
@@ -2853,6 +2859,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 2853 |
case GGML_TYPE_IQ3_XXS:
|
| 2854 |
case GGML_TYPE_IQ4_NL:
|
| 2855 |
case GGML_TYPE_IQ4_XS:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2856 |
return true;
|
| 2857 |
default:
|
| 2858 |
return false;
|
|
@@ -2978,6 +2989,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 2978 |
case GGML_OP_RWKV_WKV:
|
| 2979 |
return true;
|
| 2980 |
case GGML_OP_FLASH_ATTN_EXT: {
|
|
|
|
|
|
|
|
|
|
| 2981 |
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
| 2982 |
return true;
|
| 2983 |
}
|
|
|
|
| 136 |
return res;
|
| 137 |
#else
|
| 138 |
|
| 139 |
+
#if !defined(GGML_USE_HIPBLAS)
|
| 140 |
cudaError_t err;
|
| 141 |
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
| 142 |
{
|
|
|
|
| 149 |
return err;
|
| 150 |
#else
|
| 151 |
return cudaMalloc(ptr, size);
|
| 152 |
+
#endif // !defined(GGML_USE_HIPBLAS)
|
| 153 |
|
| 154 |
#endif
|
| 155 |
}
|
|
|
|
| 2830 |
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
|
| 2831 |
return false;
|
| 2832 |
}
|
| 2833 |
+
#ifdef GGML_USE_MUSA
|
| 2834 |
+
if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
|
| 2835 |
+
!ggml_is_transposed(a) && !ggml_is_transposed(b)) {
|
| 2836 |
+
return false;
|
| 2837 |
+
}
|
| 2838 |
+
#endif // GGML_USE_MUSA
|
| 2839 |
switch (a->type) {
|
| 2840 |
case GGML_TYPE_F32:
|
| 2841 |
case GGML_TYPE_F16:
|
|
|
|
| 2859 |
case GGML_TYPE_IQ3_XXS:
|
| 2860 |
case GGML_TYPE_IQ4_NL:
|
| 2861 |
case GGML_TYPE_IQ4_XS:
|
| 2862 |
+
#ifdef GGML_USE_MUSA
|
| 2863 |
+
if (a->type == GGML_TYPE_Q3_K) {
|
| 2864 |
+
return false;
|
| 2865 |
+
}
|
| 2866 |
+
#endif // GGML_USE_MUSA
|
| 2867 |
return true;
|
| 2868 |
default:
|
| 2869 |
return false;
|
|
|
|
| 2989 |
case GGML_OP_RWKV_WKV:
|
| 2990 |
return true;
|
| 2991 |
case GGML_OP_FLASH_ATTN_EXT: {
|
| 2992 |
+
#ifndef FLASH_ATTN_AVAILABLE
|
| 2993 |
+
return false;
|
| 2994 |
+
#endif
|
| 2995 |
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
| 2996 |
return true;
|
| 2997 |
}
|
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -50,6 +50,8 @@
|
|
| 50 |
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
| 51 |
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
| 52 |
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
|
|
|
|
|
|
| 53 |
|
| 54 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 55 |
|
|
@@ -134,6 +136,10 @@ typedef float2 dfloat2;
|
|
| 134 |
#define INT8_MMA_AVAILABLE
|
| 135 |
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
static constexpr bool fast_fp16_available(const int cc) {
|
| 138 |
return cc >= CC_PASCAL && cc != 610;
|
| 139 |
}
|
|
|
|
| 50 |
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
| 51 |
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
| 52 |
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
| 53 |
+
#define CC_QY1 210
|
| 54 |
+
#define CC_QY2 220
|
| 55 |
|
| 56 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 57 |
|
|
|
|
| 136 |
#define INT8_MMA_AVAILABLE
|
| 137 |
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
| 138 |
|
| 139 |
+
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
| 140 |
+
#define FLASH_ATTN_AVAILABLE
|
| 141 |
+
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
| 142 |
+
|
| 143 |
static constexpr bool fast_fp16_available(const int cc) {
|
| 144 |
return cc >= CC_PASCAL && cc != 610;
|
| 145 |
}
|
ggml/src/ggml-cuda/fattn-tile-f32.cu
CHANGED
|
@@ -44,13 +44,17 @@ static __global__ void flash_attn_tile_ext_f32(
|
|
| 44 |
const int ne1,
|
| 45 |
const int ne2,
|
| 46 |
const int ne3) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
// Skip unused kernel variants for faster compilation:
|
| 48 |
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
| 49 |
NO_DEVICE_CODE;
|
| 50 |
return;
|
| 51 |
}
|
| 52 |
|
| 53 |
-
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
| 54 |
|
| 55 |
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
| 56 |
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
|
|
|
| 44 |
const int ne1,
|
| 45 |
const int ne2,
|
| 46 |
const int ne3) {
|
| 47 |
+
#ifndef FLASH_ATTN_AVAILABLE
|
| 48 |
+
NO_DEVICE_CODE;
|
| 49 |
+
return;
|
| 50 |
+
#endif // FLASH_ATTN_AVAILABLE
|
| 51 |
// Skip unused kernel variants for faster compilation:
|
| 52 |
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
| 53 |
NO_DEVICE_CODE;
|
| 54 |
return;
|
| 55 |
}
|
| 56 |
|
| 57 |
+
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
| 58 |
|
| 59 |
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
| 60 |
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
ggml/src/ggml-cuda/vendors/musa.h
CHANGED
|
@@ -26,6 +26,7 @@
|
|
| 26 |
#define cublasSetStream mublasSetStream
|
| 27 |
#define cublasSgemm mublasSgemm
|
| 28 |
#define cublasStatus_t mublasStatus_t
|
|
|
|
| 29 |
#define cublasGetStatusString mublasStatus_to_string
|
| 30 |
#define cudaDataType_t musaDataType_t
|
| 31 |
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
|
@@ -56,6 +57,7 @@
|
|
| 56 |
#define cudaLaunchHostFunc musaLaunchHostFunc
|
| 57 |
#define cudaMalloc musaMalloc
|
| 58 |
#define cudaMallocHost musaMallocHost
|
|
|
|
| 59 |
#define cudaMemcpy musaMemcpy
|
| 60 |
#define cudaMemcpyAsync musaMemcpyAsync
|
| 61 |
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
|
|
|
| 26 |
#define cublasSetStream mublasSetStream
|
| 27 |
#define cublasSgemm mublasSgemm
|
| 28 |
#define cublasStatus_t mublasStatus_t
|
| 29 |
+
#define cublasOperation_t mublasOperation_t
|
| 30 |
#define cublasGetStatusString mublasStatus_to_string
|
| 31 |
#define cudaDataType_t musaDataType_t
|
| 32 |
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
|
|
|
| 57 |
#define cudaLaunchHostFunc musaLaunchHostFunc
|
| 58 |
#define cudaMalloc musaMalloc
|
| 59 |
#define cudaMallocHost musaMallocHost
|
| 60 |
+
#define cudaMallocManaged musaMallocManaged
|
| 61 |
#define cudaMemcpy musaMemcpy
|
| 62 |
#define cudaMemcpyAsync musaMemcpyAsync
|
| 63 |
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|