SYCL: Reduce most of the compiler warnings (llama/10748)
* Try to reduce some unused and typecast warnings
* Reduce compiler warnings step 2
* add a newline at the end of the file
* Initialize nreduce as size_t
* [SYCL] Remove pragma directives from mmq.cpp
* SYCL: mmq add condition to prevent blocks_per_tile_x_row variable from becoming 0
* SYCL softmax: Initialize nreduce as size_t
* ggml-sycl.cpp: fix some trailing whitespaces
* SYCL: remove the unused variables instead of commenting them out
* SYCL pool2d kernel: set NAN for invalid pooling op
* SYCL gemm.hpp: remove pragma directives
* SYCL gemm.hpp: use const cast to properly support dnnl::memory
* SYCL: wkv6 remove a comment
* SYCL: clean comments step 2
* SYCL: clean comments and variables step 3
* SYCL: Use GGML_UNUSED for unused variables (see the sketch after the file list below)
* SYCL: remove extra empty lines and a comment
* Remove TODO
* cleanup spaces
* add a stdout for unsupported op
* use sycl printf over fprintf
* remove prints for CI
* SYCL ggml-sycl: pool2D use sycl::nan and remove if-else block
---------
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
- ggml/src/ggml-sycl/common.cpp +2 -11
- ggml/src/ggml-sycl/common.hpp +1 -0
- ggml/src/ggml-sycl/concat.cpp +2 -2
- ggml/src/ggml-sycl/convert.cpp +1 -1
- ggml/src/ggml-sycl/dmmv.cpp +5 -5
- ggml/src/ggml-sycl/dpct/helper.hpp +1 -1
- ggml/src/ggml-sycl/element_wise.cpp +80 -61
- ggml/src/ggml-sycl/gemm.hpp +4 -4
- ggml/src/ggml-sycl/ggml-sycl.cpp +74 -60
- ggml/src/ggml-sycl/im2col.cpp +3 -2
- ggml/src/ggml-sycl/mmq.cpp +6 -6
- ggml/src/ggml-sycl/mmvq.cpp +6 -20
- ggml/src/ggml-sycl/norm.cpp +4 -3
- ggml/src/ggml-sycl/rope.cpp +4 -3
- ggml/src/ggml-sycl/softmax.cpp +7 -7
- ggml/src/ggml-sycl/tsembd.cpp +1 -0
- ggml/src/ggml-sycl/wkv6.cpp +4 -1
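
Note: the recurring pattern in this change is marking intentionally unused parameters and variables so that -Wunused-parameter / -Wunused-variable stay quiet. A minimal sketch of the idea follows; the fallback GGML_UNUSED definition shown here is only an illustrative assumption, the real macro lives in the ggml headers.

// Sketch only: illustrates the GGML_UNUSED pattern used throughout this commit.
#ifndef GGML_UNUSED
#define GGML_UNUSED(x) (void)(x)   // assumed fallback; the actual definition is in ggml's headers
#endif

// Backend op hooks share one signature, so some parameters go unused in some ops.
// Casting them to void tells the compiler the omission is intentional.
static void op_stub(void * ctx, const float * src1, float * dst) {
    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
}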
ggml/src/ggml-sycl/common.cpp

@@ -11,6 +11,7 @@
 //
 
 #include "common.hpp"
+#include "ggml-impl.h"
 
 int get_current_device_id() {
   return dpct::dev_mgr::instance().current_device_id();

@@ -28,11 +29,7 @@ void* ggml_sycl_host_malloc(size_t size) try {
 
     if (err != 0) {
         // clear the error
-        fprintf(
-            stderr,
-            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size / 1024.0 / 1024.0,
-            "syclGetErrorString is not supported");
+        GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported");
         return nullptr;
     }

@@ -66,18 +63,12 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
 
     const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
 
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(             dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  = (ggml_tensor_extra_gpu *)  dst->extra;
-
     // dd = data device
     float * src0_ddf = (float *) src0->data;
     float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
ggml/src/ggml-sycl/common.hpp

@@ -626,6 +626,7 @@ struct bin_bcast_sycl {
                 });
             }
         }
+        GGML_UNUSED(ctx);
     }
 };
ggml/src/ggml-sycl/concat.cpp

@@ -47,7 +47,7 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
       // operation
       int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                        item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-      if (item_ct1.get_group(1) < ne01) { // src0
+      if (item_ct1.get_group(1) < (size_t) ne01) { // src0
           int offset_src =
               nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
           dst[offset_dst] = x[offset_src];

@@ -70,7 +70,7 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
       // operation
       int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                        item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-      if (item_ct1.get_group(0) < ne02) { // src0
+      if (item_ct1.get_group(0) < (size_t) ne02) { // src0
          int offset_src = nidx + item_ct1.get_group(1) * ne0 +
                           item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
          dst[offset_dst] = x[offset_src];
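
The (size_t) casts above exist because item_ct1.get_group(...) returns an unsigned index while ne01/ne02 are signed ints, which trips -Wsign-compare. A small stand-alone sketch of the warning and the fix (the function and parameter names are illustrative, not from the repository):

#include <cstddef>

// group_idx stands in for item_ct1.get_group(1), which is unsigned in SYCL.
static bool within_src0(std::size_t group_idx, int ne01) {
    // return group_idx < ne01;              // signed/unsigned comparison -> -Wsign-compare
    return group_idx < (std::size_t) ne01;   // explicit cast documents the intended comparison
}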
ggml/src/ggml-sycl/convert.cpp

@@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y,
     const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
 
     // make each work-item deal with more elements since sycl global range can not exceed max int
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
     for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
         y[i] = x[i];
     }
ggml/src/ggml-sycl/dmmv.cpp

@@ -1015,9 +1015,9 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
 }
ggml/src/ggml-sycl/dpct/helper.hpp

@@ -1237,7 +1237,7 @@ namespace dpct
 
         std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
         {
-            auto it = m_map.upper_bound((byte_t *) ptr);
+            auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
             if (it == m_map.end())
             {
                 // Not a virtual pointer.
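
For the dpct helper, the lookup key for std::map::upper_bound has to be a mutable byte_t * even though the caller only holds a const void *, hence the const_cast/reinterpret_cast chain above: it keeps the const removal explicit instead of hiding it in a C-style cast. A hedged, self-contained sketch (byte_t is assumed to be unsigned char here):

#include <map>

using byte_t = unsigned char;                      // assumption for this sketch

static std::map<byte_t *, int>::iterator
find_alloc(std::map<byte_t *, int> & m, const void * ptr) {
    // (byte_t *) ptr would also compile, but silently casts away const;
    // the two-step cast states exactly what is being converted.
    return m.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
}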
ggml/src/ggml-sycl/element_wise.cpp

@@ -237,7 +237,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }

@@ -251,8 +251,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
     // operation
     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
+    if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
         int offset_src = nidx + item_ct1.get_group(1) * ne00 +
                          item_ct1.get_group(0) * ne00 * ne01;
         dst[offset_dst] = x[offset_src];

The remaining hunks in this file (@@ -520, -535, -550, -564, -579, -595, -610, -625, -640, -655, -670, -685, -700, -715, -730, -749, -764, -787, -805, -827) apply the same substitution to ggml_sycl_op_silu, gelu, gelu_quick, tanh, relu, hardsigmoid, hardswish, exp, log, sigmoid, sqrt, sin, cos, step, neg, leaky_relu, sqr, upscale, pad, and acc: the trailing

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;

lines become

+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);

(ggml_sycl_op_acc only replaces "(void) dst;" with GGML_UNUSED(dst) and GGML_UNUSED(ctx)).
ggml/src/ggml-sycl/gemm.hpp

@@ -51,8 +51,8 @@ public:
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
         const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, (void *) a);
-        auto b_mem = dnnl::memory(b_in_md, eng, (void *) b);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
         auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);

@@ -79,8 +79,8 @@ public: — the same (void *) -> const_cast<void*> change for a_mem and b_mem in the second overload.
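
dnnl::memory takes its data handle as a plain void *, while the GEMM wrapper receives const pointers, which is why the gemm.hpp hunks switch to const_cast. A minimal sketch of the same idea against a stand-in API (take_handle is hypothetical, not a oneDNN function):

static void take_handle(void * /*handle*/) {}      // stand-in for dnnl::memory(..., void *)

static void bind_input(const float * a) {
    // take_handle((void *) a);                     // compiles, but hides the cast-away const
    take_handle(const_cast<void *>(static_cast<const void *>(a)));
}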
ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        GGML_LOG_ERROR("%s: failed to initialize …
+        GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
         return info;
     }

@@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 #else
     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    GGML_LOG_INFO("%s: found %d …
+    GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
 
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;

@@ -137,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {
 
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
         std::string backend_type = get_device_backend_and_type(device);
         int type_id = DeviceNums[backend_type]++;
         std::stringstream device_type;

@@ -420,13 +419,11 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
         return true;
     }
     return false;
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
 
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {

@@ -1092,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
     size_t pool_size = 0;
 
-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
 
     ~ggml_sycl_pool_leg() {
         for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {

@@ -1238,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
         zeros[i] = 0.f;
         qzeros[i] = 0;
     }
-    const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+    const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
     float sum = xi[0];
     float amax = sycl::fabs(xi[0]);
 #pragma unroll

@@ -1799,6 +1793,9 @@ static void pool2d_nchw_kernel(
         switch (op) {
             case GGML_OP_POOL_AVG: res = 0; break;
             case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+            default:
+                res = (To) sycl::nan(uint32_t(0));
+                break;
         }
 
         for (int i = bh; i < eh; i += 1) {

@@ -1817,6 +1814,9 @@ static void pool2d_nchw_kernel(
                 switch (op) {
                     case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
                     case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                    default:
+                        res = (To) sycl::nan(uint32_t(0));
+                        break;
                 }
             }
         }

@@ -1855,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
         });
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

@@ -1893,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
         });
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {

@@ -2464,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
 
     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
-    (void) src1;
-    (void) src1_d;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
 }

@@ -2484,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne10 = src1->ne[0];
 
-    const int64_t ne0 = dst->ne[0];
 
     const int64_t row_diff = row_high - row_low;
 
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = id == ctx.device ? ne0 : row_diff;
+#endif
 
 #ifdef GGML_SYCL_F16
     bool use_fp16 = true; // TODO(Yu) SYCL capability check

@@ -2531,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             : src1_as_f16.get();
         ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
 
-    const sycl::half alpha_f16 = 1.0f;
-    const sycl::half beta_f16 = 0.0f;
 #if !GGML_SYCL_DNNL
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *stream, oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,

@@ -2570,9 +2572,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
 #if !GGML_SYCL_DNNL
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
 # ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
             oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,

@@ -2590,9 +2592,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
     }
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__

@@ -2638, -2654, -2673, -2694, -2713, -2735, -2758, -2783: the same replacement in ggml_sycl_op_pool2d, sum, sum_rows, argsort, argmax, diag_mask_inf, scale, and clamp — the trailing "(void) src1; (void) dst; (void) src1_dd;" casts become GGML_UNUSED(...) plus an added GGML_UNUSED(ctx) (pool2d only has src1 and src1_dd to replace).

@@ -2862,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);

@@ -3289,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t ne_dst = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();;

@@ -3397,6 +3405,7 @@ catch (sycl::exception const &exc) {
 
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
     return false;
 }

@@ -3772,7 +3781,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
         GGML_ABORT("fatal error");
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__

@@ -3783,7 +3792,7 @@ catch (sycl::exception const &exc) {
 static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
     ggml_sycl_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
+    GGML_UNUSED(src1);
 }

@@ -3828,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
 }
 
 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) …
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
     check_allow_gpu_index(main_device);
     dpct::select_device(main_device);

@@ -4202,6 +4214,7 @@ try
 {
     ggml_backend_sycl_context *sycl_ctx =
         (ggml_backend_sycl_context *)backend->context;
+
     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
 
     const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);

@@ -4216,7 +4229,7 @@ — whitespace-only cleanup at the top of ggml_backend_sycl_event_wait (a trailing-whitespace line becomes an empty line).

@@ -4624,6 +4637,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
     return nullptr;
 }
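
The pool2d kernel above now poisons the accumulator with NaN in the default branch instead of keeping an if/else for unsupported pooling ops, so a bad op enum shows up as NaN in the output rather than as silently wrong numbers. A host-side sketch of the same idea; on the device the kernel uses sycl::nan(uint32_t(0)):

#include <cfloat>
#include <cmath>

enum pool_op { POOL_AVG, POOL_MAX };

static float init_acc(int op) {
    switch (op) {
        case POOL_AVG: return 0.0f;
        case POOL_MAX: return -FLT_MAX;
        default:       return std::nanf("");   // unsupported op -> NaN, easy to spot downstream
    }
}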
ggml/src/ggml-sycl/im2col.cpp

@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
         im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
     }
 
-    (void) src0;
-    (void) src0_dd;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
 }
ggml/src/ggml-sycl/mmq.cpp

@@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
     }
 
-    constexpr int blocks_per_tile_x_row = WARP_SIZE / QI4_K;
+    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
 
 #pragma unroll

@@ -961,7 +961,7 @@ load_tiles_q5_K — the same guard added for QI5_K:
+    constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256

@@ -1109,7 +1109,7 @@ load_tiles_q6_K — the same guard added for QI6_K:
+    constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256

@@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
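
The QI*_K > WARP_SIZE guard exists because blocks_per_tile_x_row feeds a modulo: if the integer division ever produced 0, k % blocks_per_tile_x_row would be a division by zero. Clamping to 1 keeps it well defined. The values below are illustrative assumptions, not the real build configuration:

constexpr int WARP_SIZE_EXAMPLE = 16;   // hypothetical sub-group size
constexpr int QI4_K_EXAMPLE     = 32;   // hypothetical quants-per-int for Q4_K

constexpr int blocks_per_tile_x_row_example =
    QI4_K_EXAMPLE > WARP_SIZE_EXAMPLE ? 1 : WARP_SIZE_EXAMPLE / QI4_K_EXAMPLE;   // == 1, never 0

static_assert(blocks_per_tile_x_row_example > 0, "modulo divisor must stay non-zero");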
@@ -753,11 +753,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
-
-        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
+        stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

@@ -780,9 +776,6 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

@@ -805,9 +798,6 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

@@ -830,8 +820,6 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

@@ -854,9 +842,6 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

@@ -954,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-
+
     for (int i = 0; i < src1_ncols; i++)
     {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;

@@ -1023,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
             break;
     }
     }
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(ctx);
 }
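Note on the five mul_mat_vec_iq*_q8_1_sycl hunks above (interpretation, not from the patch): the deleted `auto *_ptr_ct1 = &...[0];` aliases were declared inside the command group but apparently never read by the kernels, which is exactly what -Wunused-variable flags. A generic sketch of the cleaned-up shape, assuming a USM float pointer and a hypothetical `scale` kernel:

    #include <sycl/sycl.hpp>

    void scale(sycl::queue & q, float * data, int n, float factor) {
        q.submit([&](sycl::handler & cgh) {
            // Declare only what the kernel actually captures; a helper alias that the
            // lambda never reads would trigger an unused-variable warning here.
            cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
                data[i] *= factor;   // `data` and `factor` are captured because they are used
            });
        });
    }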
@@ -31,7 +31,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
     mean_var = 0.f;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     for (size_t i = 0; i < nreduce; i += 1)
     {
         mean_var += s_sum[lane_id + i * WARP_SIZE];

@@ -55,7 +55,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
     start += item_ct1.get_local_id(2);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;

     if (end >= ne_elements) {
         end = ne_elements;

@@ -163,7 +163,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
        converged control flow. You may need to adjust the code.
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     tmp = 0.f;
     for (size_t i = 0; i < nreduce; i += 1)
     {

@@ -352,6 +352,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     (void)src1;
     (void)dst;
     (void)src1_dd;
+    GGML_UNUSED(ctx);
 }

 void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
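Note (sketch, not from the patch): declaring nreduce as size_t matches the type of the loop counter in `for (size_t i = 0; i < nreduce; ...)`, so the comparison no longer mixes signed and unsigned operands (-Wsign-compare). A minimal illustration, with `warp_size` as a parameter standing in for the WARP_SIZE macro:

    #include <cstddef>

    // Fold the per-warp partial sums that one lane is responsible for.
    static float fold_partials(const float * s_sum, int lane_id, int nwarps, int warp_size) {
        size_t nreduce = nwarps / warp_size;   // same type as the loop counter below
        float acc = 0.f;
        for (size_t i = 0; i < nreduce; i += 1) {
            acc += s_sum[lane_id + i * warp_size];
        }
        return acc;
    }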
@@ -269,7 +269,8 @@ void ggml_sycl_op_rope(
     }
     }

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
@@ -16,7 +16,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
     const int nthreads = block_size;
     const int nwarps = nthreads / WARP_SIZE;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     float slope = 1.0f;

     // ALiBi

@@ -53,8 +53,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     if (block_size > WARP_SIZE) {
         if (warp_id == 0) {
             buf[lane_id] = -INFINITY;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = -INFINITY;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);

@@ -63,8 +64,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         max_val = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
         }
         max_val = warp_reduce_max(max_val, item_ct1);

@@ -89,8 +89,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = 0.f;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);

@@ -100,8 +101,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);

         tmp = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             tmp += buf[lane_id + i * WARP_SIZE];
         }
         tmp = warp_reduce_sum(tmp, item_ct1);
@@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml
     const int max_period = dst->op_params[1];

     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+    GGML_UNUSED(src1);
 }
@@ -59,7 +59,7 @@ static void rwkv_wkv_f32_kernel(
     float y = 0;

     // Process in chunks of 4 for better vectorization
-    sycl::float4 k4, r4, tf4, td4, s4
+    sycl::float4 k4, r4, tf4, td4, s4;
 #pragma unroll
     for (int j = 0; j < head_size; j += 4) {
         // Load data in vec4 chunks

@@ -135,4 +135,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
             );
         });
     });
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
 }
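Note (sketch, not from the patch): sycl::float4 is the four-wide sycl::vec<float, 4> used for the chunked loads in rwkv_wkv_f32_kernel. A standalone sketch of that access pattern, with made-up buffer names and head_size assumed to be a multiple of 4:

    #include <sycl/sycl.hpp>

    // Accumulate a dot product four floats at a time using sycl::float4.
    static float dot4(const float * k, const float * r, int head_size) {
        float y = 0.f;
        sycl::float4 k4, r4;                       // same declaration style as the hunk above
        for (int j = 0; j < head_size; j += 4) {
            k4 = sycl::float4(k[j], k[j + 1], k[j + 2], k[j + 3]);
            r4 = sycl::float4(r[j], r[j + 1], r[j + 2], r[j + 3]);
            sycl::float4 p = k4 * r4;              // element-wise multiply on sycl::vec
            y += p.x() + p.y() + p.z() + p.w();
        }
        return y;
    }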