Commit: a687ec3
Parent: 6bdff5c
musa: upgrade musa sdk to rc4.2.0 (llama/14498)
* musa: apply mublas API changes
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: update musa version to 4.2.0
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: restore MUSA graph settings in CMakeLists.txt
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: disable mudnnMemcpyAsync by default
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: switch back to non-mudnn images
Signed-off-by: Xiaodong Ye <[email protected]>
* minor changes
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: restore rc in docker image tag
Signed-off-by: Xiaodong Ye <[email protected]>
---------
Signed-off-by: Xiaodong Ye <[email protected]>
- ggml/CMakeLists.txt +2 -0
- ggml/src/ggml-cuda/common.cuh +1 -1
- ggml/src/ggml-cuda/cpy.cu +7 -7
- ggml/src/ggml-cuda/vendors/musa.h +2 -2
- ggml/src/ggml-musa/CMakeLists.txt +18 -4
ggml/CMakeLists.txt CHANGED

@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
+option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
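Both new options default to OFF, so a stock MUSA build behaves exactly as before; opting in is a configure-time choice, e.g. (assuming the usual GGML_MUSA toolchain flag) `cmake -B build -DGGML_MUSA=ON -DGGML_MUSA_GRAPHS=ON -DGGML_MUSA_MUDNN_COPY=ON`.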
ggml/src/ggml-cuda/common.cuh CHANGED

@@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
 
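The practical effect of this one-line change: defining GGML_MUSA_GRAPHS now enables the same USE_CUDA_GRAPH code paths that CUDA and HIP graph builds use. A minimal standalone sketch (not code from the tree) of how the guard behaves under different compile flags:

#include <cstdio>

// same guard shape as common.cuh; compile with e.g. -DGGML_MUSA_GRAPHS
// to see the graph path selected
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
#define USE_CUDA_GRAPH
#endif

int main() {
#ifdef USE_CUDA_GRAPH
    std::puts("USE_CUDA_GRAPH defined: graph capture paths compiled in");
#else
    std::puts("USE_CUDA_GRAPH not defined: graph capture paths compiled out");
#endif
    return 0;
}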
ggml/src/ggml-cuda/cpy.cu CHANGED

@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 

@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         CUDA_CHECK(cudaStreamSynchronize(stream));
         if (cuda_graph->dest_ptrs_d != nullptr) {

@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;

@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
             CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
         } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }

@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                    ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
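Note the control-flow trick in the contiguous-copy hunk: the `if ... else` sits inside the #if block while the following `{ ... }` sits outside it, so with the flags off the compound statement runs unconditionally, and with them on it becomes the else branch of the muDNN fast path. A standalone sketch of the pattern (copy_via_mudnn/copy_via_memcpy are hypothetical stand-ins for mudnnMemcpyAsync/cudaMemcpyAsync):

#include <cstdio>

static void copy_via_mudnn()  { std::puts("muDNN accelerated copy"); }      // stand-in
static void copy_via_memcpy() { std::puts("plain device-to-device copy"); } // stand-in

static void dispatch_copy(bool f32_or_f16) {
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
    if (f32_or_f16) {
        copy_via_mudnn();
    } else
#endif
    {
        // with the flags off, the preprocessor strips the if/else above
        // and every call falls through to this block
        copy_via_memcpy();
        (void) f32_or_f16; // silence unused warning in the stripped build
    }
}

int main() {
    dispatch_copy(true);  // muDNN path only when both macros are defined
    dispatch_copy(false); // always the plain copy
    return 0;
}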
ggml/src/ggml-cuda/vendors/musa.h CHANGED

@@ -13,7 +13,7 @@
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F MUSA_R_32F

@@ -29,7 +29,7 @@
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
-#define cublasGetStatusString
+#define cublasGetStatusString mublasGetStatusString
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
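For context, vendors/musa.h is a header-level shim: each CUDA spelling is #define'd to its MUSA equivalent so the shared .cu sources compile unchanged against the MUSA SDK, and this commit re-points two aliases at the renamed rc4.2.0 mublas symbols. A toy illustration of the idea (the mublas declarations below are placeholders, not the real API):

// toy shim: map the CUDA spelling onto a MUSA-style implementation
#define cublasStatus_t mublasStatus_t
#define cublasSgemm    mublasSgemm

typedef int mublasStatus_t;           // placeholder, not the real type
static mublasStatus_t mublasSgemm() { // placeholder, not the real signature
    return 0;                         // pretend success
}

int main() {
    // source written against the cuBLAS names...
    cublasStatus_t status = cublasSgemm(); // ...expands to mublasSgemm()
    return status;
}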
ggml/src/ggml-musa/CMakeLists.txt CHANGED

@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    file(GLOB SRCS "../ggml-musa/*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+
+    if (GGML_MUSA_MUDNN_COPY)
+        file(GLOB SRCS "../ggml-musa/*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_MUSA_MUDNN_COPY)
+    endif()
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")

@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
+    if (GGML_MUSA_GRAPHS)
+        add_compile_definitions(GGML_MUSA_GRAPHS)
+    endif()
+
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()

@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
     endif()
 
     if (GGML_STATIC)
-        # TODO: mudnn has not provided static libraries yet
         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+        # TODO: mudnn has not provided static libraries yet
+        # if (GGML_MUSA_MUDNN_COPY)
+        #     target_link_libraries(ggml-musa PRIVATE mudnn_static)
+        # endif()
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        if (GGML_MUSA_MUDNN_COPY)
+            target_link_libraries(ggml-musa PRIVATE mudnn)
+        endif()
     endif()
 
     if (GGML_CUDA_NO_VMM)
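The linking split mirrors the TODO above: because muDNN ships no static library yet, GGML_STATIC builds simply leave the muDNN hook out (the commented block is a placeholder for a future mudnn_static), while dynamic builds add the mudnn dependency only when GGML_MUSA_MUDNN_COPY is enabled.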