Neo Zhang Jianyu and arthw committed · commit 691e6ac · 1 parent: b6c412f

fix mul_mat_vec_q and *_vec_q error (llama/9939)
Co-authored-by: arthw <[email protected]>
ggml/src/ggml-sycl/mmvq.cpp CHANGED (+69 -67)
@@ -1,6 +1,6 @@
 #include "mmvq.hpp"
 #include "vecdotq.hpp"
-
+#include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
@@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
     // partial sum for each thread
     float tmp = 0.0f;
@@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
     // partial sum for each thread
     float tmp = 0.0f;
@@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
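
Note on the assert added in the hunks above: blocks_per_warp is computed with integer division, so if the sub-group the kernel actually runs in is narrower than qi / vdr, the stride truncates to zero and the per-block loop can no longer advance across the row. Below is a minimal standalone sketch of that failure mode; the concrete values (vdr = 1, qi = 32, sub-group widths 32 and 16) are illustrative, not taken from this commit.

    #include <cassert>
    #include <cstdio>

    int main() {
        const int vdr = 1, qi = 32;           // illustrative template parameters
        printf("%d\n", vdr * 32 / qi);        // 32-wide sub-group: stride 1, loop advances
        int blocks_per_warp = vdr * 16 / qi;  // 16-wide sub-group: 16/32 truncates to 0
        printf("%d\n", blocks_per_warp);      // stride 0: the block loop would never advance
        assert(blocks_per_warp > 0);          // the new assert turns this into a loud failure
        return 0;
    }
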
@@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -749,7 +751,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -759,7 +761,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -774,7 +776,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -784,7 +786,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -799,7 +801,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -809,7 +811,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -824,7 +826,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -833,7 +835,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -848,7 +850,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -858,7 +860,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -873,13 +875,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -894,14 +896,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -916,14 +918,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
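
Note on the launcher hunks: every launcher makes the same pair of substitutions, so the work-group shape (block_dims) and the required sub-group size now both come from QK_WARP_SIZE, and the width the reduction loop assumes can no longer drift apart from the width the kernel is compiled for. Below is a minimal standalone sketch of that launch pattern; SG_SIZE and ROWS_Y are illustrative stand-ins for QK_WARP_SIZE and GGML_SYCL_MMV_Y, and a device may reject a sub-group size it does not support.

    #include <sycl/sycl.hpp>

    constexpr int SG_SIZE = 32;  // stand-in for QK_WARP_SIZE
    constexpr int ROWS_Y  = 8;   // stand-in for GGML_SYCL_MMV_Y

    int main() {
        sycl::queue stream;
        const int nrows = 64;
        const int block_num_y = (nrows + ROWS_Y - 1) / ROWS_Y;
        const sycl::range<3> block_nums(1, 1, block_num_y);
        const sycl::range<3> block_dims(1, ROWS_Y, SG_SIZE);  // innermost dim = one sub-group
        stream.submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(SG_SIZE)]] {
                        // kernel body elided; the attribute pins the sub-group to
                        // SG_SIZE lanes, matching the innermost work-group dimension
                        (void) item_ct1;
                    });
        }).wait();
        return 0;
    }
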
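For reference, the mask loop that closes every kernel in this file is a sub-group XOR butterfly reduction; dpct::permute_sub_group_by_xor is the SYCLomatic helper over a sub-group shuffle-xor. Below is a minimal standalone sketch of the same reduction in plain SYCL 2020, using sycl::permute_group_by_xor; a 16-wide sub-group is chosen only because most devices support it, and the arithmetic is identical at 32.

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main() {
        constexpr int SG = 16;  // sub-group width for the demo
        sycl::queue q;
        int *out = sycl::malloc_shared<int>(SG, q);
        q.parallel_for(
            sycl::nd_range<1>(SG, SG),
            [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SG)]] {
                auto sg  = it.get_sub_group();
                int lane = sg.get_local_id()[0];
                int tmp  = lane + 1;  // per-lane partial sum, like tmp in the kernels
                // XOR butterfly: after log2(SG) exchange steps every lane holds
                // the sub-group total (what the kernels' mask loop computes)
                for (int mask = SG / 2; mask > 0; mask >>= 1) {
                    tmp += sycl::permute_group_by_xor(sg, tmp, mask);
                }
                out[lane] = tmp;
            }).wait();
        printf("lane 0 sum = %d (expected %d)\n", out[0], SG * (SG + 1) / 2);
        sycl::free(out, q);
        return 0;
    }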