yeahdongcn committed on
Commit
a687ec3
·
1 Parent(s): 6bdff5c

musa: upgrade musa sdk to rc4.2.0 (llama/14498)

Browse files

* musa: apply mublas API changes

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: update musa version to 4.2.0

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: restore MUSA graph settings in CMakeLists.txt

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: disable mudnnMemcpyAsync by default

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: switch back to non-mudnn images

Signed-off-by: Xiaodong Ye <[email protected]>

* minor changes

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: restore rc in docker image tag

Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>

ggml/CMakeLists.txt CHANGED
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
174
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
175
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
176
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 
 
177
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
178
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
179
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
 
174
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
175
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
176
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
177
+ option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
178
+ option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
179
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
180
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
181
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu {
765
  };
766
 
767
 
768
- #if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
769
  #define USE_CUDA_GRAPH
770
  #endif
771
 
 
765
  };
766
 
767
 
768
+ #if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS))
769
  #define USE_CUDA_GRAPH
770
  #endif
771
 
ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -1,9 +1,9 @@
1
  #include "cpy.cuh"
2
  #include "dequantize.cuh"
3
  #include "cpy-utils.cuh"
4
- #ifdef GGML_USE_MUSA
5
  #include "ggml-musa/mudnn.cuh"
6
- #endif // GGML_USE_MUSA
7
 
8
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
9
 
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
121
  // Copy destination pointers to GPU to be available when pointer indirection is in use
122
 
123
  void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
124
- #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
125
  if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
126
  CUDA_CHECK(cudaStreamSynchronize(stream));
127
  if (cuda_graph->dest_ptrs_d != nullptr) {
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
314
 
315
  char ** dest_ptrs_d = nullptr;
316
  int graph_cpynode_index = -1;
317
- #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
318
  if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
319
  dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
320
  graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
324
  #endif
325
  if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
326
  GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
327
- #ifdef GGML_USE_MUSA
328
  if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
329
  CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
330
  } else
331
- #endif // GGML_USE_MUSA
332
  {
333
  CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
334
  }
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
379
  GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
380
  ggml_type_name(src0->type), ggml_type_name(src1->type));
381
  }
382
- #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
383
  if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
384
  ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
385
  }
 
1
  #include "cpy.cuh"
2
  #include "dequantize.cuh"
3
  #include "cpy-utils.cuh"
4
+ #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
5
  #include "ggml-musa/mudnn.cuh"
6
+ #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
7
 
8
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
9
 
 
121
  // Copy destination pointers to GPU to be available when pointer indirection is in use
122
 
123
  void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
124
+ #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
125
  if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
126
  CUDA_CHECK(cudaStreamSynchronize(stream));
127
  if (cuda_graph->dest_ptrs_d != nullptr) {
 
314
 
315
  char ** dest_ptrs_d = nullptr;
316
  int graph_cpynode_index = -1;
317
+ #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
318
  if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
319
  dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
320
  graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
 
324
  #endif
325
  if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
326
  GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
327
+ #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
328
  if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
329
  CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
330
  } else
331
+ #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
332
  {
333
  CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
334
  }
 
379
  GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
380
  ggml_type_name(src0->type), ggml_type_name(src1->type));
381
  }
382
+ #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
383
  if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
384
  ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
385
  }
ggml/src/ggml-cuda/vendors/musa.h CHANGED
@@ -13,7 +13,7 @@
13
  #define CUBLAS_OP_N MUBLAS_OP_N
14
  #define CUBLAS_OP_T MUBLAS_OP_T
15
  #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
16
- #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
17
  #define CUDA_R_16F MUSA_R_16F
18
  #define CUDA_R_16BF MUSA_R_16BF
19
  #define CUDA_R_32F MUSA_R_32F
@@ -29,7 +29,7 @@
29
  #define cublasSgemm mublasSgemm
30
  #define cublasStatus_t mublasStatus_t
31
  #define cublasOperation_t mublasOperation_t
32
- #define cublasGetStatusString mublasStatus_to_string
33
  #define cudaDataType_t musaDataType_t
34
  #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
35
  #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
 
13
  #define CUBLAS_OP_N MUBLAS_OP_N
14
  #define CUBLAS_OP_T MUBLAS_OP_T
15
  #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
16
+ #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
17
  #define CUDA_R_16F MUSA_R_16F
18
  #define CUDA_R_16BF MUSA_R_16BF
19
  #define CUDA_R_32F MUSA_R_32F
 
29
  #define cublasSgemm mublasSgemm
30
  #define cublasStatus_t mublasStatus_t
31
  #define cublasOperation_t mublasOperation_t
32
+ #define cublasGetStatusString mublasGetStatusString
33
  #define cudaDataType_t musaDataType_t
34
  #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
35
  #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
ggml/src/ggml-musa/CMakeLists.txt CHANGED
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
34
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
35
  file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
36
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
37
- file(GLOB SRCS "../ggml-musa/*.cu")
38
- list(APPEND GGML_SOURCES_MUSA ${SRCS})
 
 
 
 
39
 
40
  if (GGML_CUDA_FA_ALL_QUANTS)
41
  file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
72
  add_compile_definitions(GGML_USE_MUSA)
73
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
74
 
 
 
 
 
75
  if (GGML_CUDA_FORCE_MMQ)
76
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
77
  endif()
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
97
  endif()
98
 
99
  if (GGML_STATIC)
100
- # TODO: mudnn has not provided static libraries yet
101
  target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
 
 
 
 
102
  else()
103
- target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
 
 
 
104
  endif()
105
 
106
  if (GGML_CUDA_NO_VMM)
 
34
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
35
  file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
36
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
37
+
38
+ if (GGML_MUSA_MUDNN_COPY)
39
+ file(GLOB SRCS "../ggml-musa/*.cu")
40
+ list(APPEND GGML_SOURCES_MUSA ${SRCS})
41
+ add_compile_definitions(GGML_MUSA_MUDNN_COPY)
42
+ endif()
43
 
44
  if (GGML_CUDA_FA_ALL_QUANTS)
45
  file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
 
76
  add_compile_definitions(GGML_USE_MUSA)
77
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
78
 
79
+ if (GGML_MUSA_GRAPHS)
80
+ add_compile_definitions(GGML_MUSA_GRAPHS)
81
+ endif()
82
+
83
  if (GGML_CUDA_FORCE_MMQ)
84
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
85
  endif()
 
105
  endif()
106
 
107
  if (GGML_STATIC)
 
108
  target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
109
+ # TODO: mudnn has not provided static libraries yet
110
+ # if (GGML_MUSA_MUDNN_COPY)
111
+ # target_link_libraries(ggml-musa PRIVATE mudnn_static)
112
+ # endif()
113
  else()
114
+ target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
115
+ if (GGML_MUSA_MUDNN_COPY)
116
+ target_link_libraries(ggml-musa PRIVATE mudnn)
117
+ endif()
118
  endif()
119
 
120
  if (GGML_CUDA_NO_VMM)