Neo Zhang Jianyu and arthw committed
Commit 691e6ac · 1 Parent(s): b6c412f

fix mul_mat_vec_q and *_vec_q error (llama/9939)


Co-authored-by: arthw <[email protected]>

Files changed (1)
  1. ggml/src/ggml-sycl/mmvq.cpp +69 -67
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -1,6 +1,6 @@
 #include "mmvq.hpp"
 #include "vecdotq.hpp"
-
+#include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
@@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
     // partial sum for each thread
     float tmp = 0.0f;
@@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
 
     // partial sum for each thread
     float tmp = 0.0f;
@@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
 
@@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
            cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -749,7 +751,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -759,7 +761,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -774,7 +776,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -784,7 +786,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -799,7 +801,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -809,7 +811,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -824,7 +826,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -833,7 +835,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -848,7 +850,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -858,7 +860,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -873,13 +875,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -894,14 +896,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -916,14 +918,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
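For readers skimming the patch: every hunk applies the same two-part pattern, deriving the per-warp block count from the renamed QK_WARP_SIZE constant (and asserting it is positive), and running the partial-sum reduction over XOR masks of that same constant. The sketch below is a standalone host-side illustration of that pattern, not the backend code: QK_WARP_SIZE is assumed to be 32 here, the qk/qi/vdr/ncols values are made-up examples, and the device-side XOR shuffle (dpct::permute_sub_group_by_xor) is emulated with a plain array.

// Standalone illustration of the pattern used throughout this commit.
// Assumptions: QK_WARP_SIZE = 32; qk/qi/vdr/ncols are example values only.
#include <cassert>
#include <cstdio>

constexpr int QK_WARP_SIZE = 32;   // assumed value for this sketch

// Host emulation of the per-lane XOR-butterfly reduction that the kernels
// perform on the device with dpct::permute_sub_group_by_xor().
static float warp_reduce_sum(float lanes[QK_WARP_SIZE]) {
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        float next[QK_WARP_SIZE];
        for (int lane = 0; lane < QK_WARP_SIZE; ++lane) {
            next[lane] = lanes[lane] + lanes[lane ^ mask];   // tmp += shuffled tmp
        }
        for (int lane = 0; lane < QK_WARP_SIZE; ++lane) {
            lanes[lane] = next[lane];
        }
    }
    return lanes[0];   // every lane now holds the full sum
}

int main() {
    // Example template parameters (hypothetical, for illustration only).
    const int qk = 32, qi = 8, vdr = 2, ncols = 4096;

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);   // the guard added by this commit

    float partial[QK_WARP_SIZE];
    for (int lane = 0; lane < QK_WARP_SIZE; ++lane) {
        partial[lane] = 1.0f;      // pretend each lane produced a partial sum
    }

    std::printf("blocks_per_row=%d blocks_per_warp=%d reduced=%g\n",
                blocks_per_row, blocks_per_warp, warp_reduce_sum(partial));
    return 0;
}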