R0CKSTAR committed
Commit 8ec75c3 · Parent: 3142fa9

musa: enable building fat binaries, enable unified memory, and disable Flash Attention on QY1 (MTT S80) (llama/9526)


* mtgpu: add mp_21 support

Signed-off-by: Xiaodong Ye <[email protected]>

* mtgpu: disable flash attention on qy1 (MTT S80); disable q3_k and mul_mat_batched_cublas

Signed-off-by: Xiaodong Ye <[email protected]>

* mtgpu: enable unified memory

Signed-off-by: Xiaodong Ye <[email protected]>

* mtgpu: map cublasOperation_t to mublasOperation_t (sync code to latest)

Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>

ggml/src/CMakeLists.txt CHANGED
@@ -364,7 +364,7 @@ if (GGML_CUDA)
     if (GGML_MUSA)
         set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
         foreach(SOURCE ${GGML_SOURCES_CUDA})
-            set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+            set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
         endforeach()
     endif()
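Passing two --cuda-gpu-arch flags makes mcc embed device code for both mp_21 (QY1, i.e. the MTT S80) and mp_22 in a single fat binary, so one build serves both GPU generations. Inside device code, arch-specific paths can then branch on __MUSA_ARCH__, which common.cuh (further down) compares against the new CC_QY1/CC_QY2 constants. A minimal sketch, assuming __MUSA_ARCH__ behaves like __CUDA_ARCH__ (defined only during the device compilation pass, holding the arch currently being compiled):

// Minimal sketch: per-arch specialization inside a MUSA fat binary.
// Assumes __MUSA_ARCH__ mirrors __CUDA_ARCH__; 210/220 match the
// CC_QY1/CC_QY2 constants introduced in common.cuh below.
#if defined(__MUSA_ARCH__) && __MUSA_ARCH__ <= 210
    // compiled only into the mp_21 image (QY1 / MTT S80), e.g. no flash attention
#else
    // compiled into the mp_22 image (QY2) and into non-MUSA builds
#endif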
ggml/src/ggml-cuda.cu CHANGED
@@ -136,7 +136,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return res;
 #else
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
@@ -149,7 +149,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 #else
     return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS)
 
 #endif
 }
@@ -2830,6 +2830,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
                 return false;
             }
+#ifdef GGML_USE_MUSA
+            if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+                !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                return false;
+            }
+#endif // GGML_USE_MUSA
             switch (a->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
@@ -2853,6 +2859,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 case GGML_TYPE_IQ3_XXS:
                 case GGML_TYPE_IQ4_NL:
                 case GGML_TYPE_IQ4_XS:
+#ifdef GGML_USE_MUSA
+                    if (a->type == GGML_TYPE_Q3_K) {
+                        return false;
+                    }
+#endif // GGML_USE_MUSA
                     return true;
                 default:
                     return false;
@@ -2978,6 +2989,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_RWKV_WKV:
             return true;
         case GGML_OP_FLASH_ATTN_EXT: {
+#ifndef FLASH_ATTN_AVAILABLE
+            return false;
+#endif
             if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
                 return true;
             }
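Three gates are added here. The unified-memory change drops !defined(GGML_USE_MUSA) from the guard, so MUSA builds now honor GGML_CUDA_ENABLE_UNIFIED_MEMORY and allocate through cudaMallocManaged, which vendors/musa.h (below) remaps to musaMallocManaged. The other two make ggml_backend_cuda_supports_op report batched F16 cuBLAS mul_mat and Q3_K as unsupported on MUSA, so the scheduler routes those ops to another backend. A condensed sketch of the resulting allocation policy; the helper name is illustrative, and the real ggml_cuda_device_malloc adds error handling around the managed path:

#include <cstdlib>        // getenv
#include <cuda_runtime.h> // cudaMalloc/cudaMallocManaged; a MUSA build pulls
                          // in the musa equivalents through vendors/musa.h

// Illustrative condensation of ggml_cuda_device_malloc after this change.
static cudaError_t device_malloc_sketch(void ** ptr, size_t size) {
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        // unified (managed) memory, pageable between host and device
        return cudaMallocManaged(ptr, size);
    }
    return cudaMalloc(ptr, size); // plain device memory
}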
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -50,6 +50,8 @@
 #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
+#define CC_QY1 210
+#define CC_QY2 220
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -134,6 +136,10 @@ typedef float2 dfloat2;
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#define FLASH_ATTN_AVAILABLE
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+
 static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }
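FLASH_ATTN_AVAILABLE is thus defined everywhere except MUSA device passes targeting QY1 or older. A hypothetical runtime counterpart, shaped like the fast_fp16_available() helper already in this header, might look like the following; it is purely illustrative and not part of the commit:

// Hypothetical runtime analogue of the compile-time FLASH_ATTN_AVAILABLE
// gate above; illustrative only, not in the actual source.
static constexpr bool flash_attn_available_sketch(const int cc) {
#if defined(GGML_USE_MUSA)
    return cc > CC_QY1; // QY1 (MTT S80) has no usable flash attention
#else
    return true;
#endif
}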
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -44,13 +44,17 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
         return;
     }
 
-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
     const int ip  = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
ggml/src/ggml-cuda/vendors/musa.h CHANGED
@@ -26,6 +26,7 @@
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
+#define cublasOperation_t mublasOperation_t
 #define cublasGetStatusString mublasStatus_to_string
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
@@ -56,6 +57,7 @@
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost
+#define cudaMallocManaged musaMallocManaged
 #define cudaMemcpy musaMemcpy
 #define cudaMemcpyAsync musaMemcpyAsync
 #define cudaMemcpyPeerAsync musaMemcpyPeerAsync
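The point of these mappings is that backend code written against the cuBLAS spellings compiles unchanged for MUSA. Assuming cublasHandle_t and the CUBLAS_OP_* enumerators are remapped elsewhere in this header, as the visible entries suggest, a call site like the sketch below resolves to mublasSgemm with mublasOperation_t arguments when GGML_USE_MUSA is defined; without the new cublasOperation_t entry, the transa/transb declarations would not compile.

// Hedged sketch of a cuBLAS-spelled call site that builds for both backends.
// Under GGML_USE_MUSA every cublas* name below is a text substitution onto
// the muBLAS API; handle creation and device buffers are omitted.
static cublasStatus_t sgemm_sketch(cublasHandle_t handle, int m, int n, int k,
                                   const float * alpha,
                                   const float * A, int lda,
                                   const float * B, int ldb,
                                   const float * beta,
                                   float * C, int ldc) {
    const cublasOperation_t transa = CUBLAS_OP_N; // mublasOperation_t on MUSA
    const cublasOperation_t transb = CUBLAS_OP_N;
    return cublasSgemm(handle, transa, transb, m, n, k,
                       alpha, A, lda, B, ldb, beta, C, ldc);
}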