qnixsynapse Abhilash Majumder committed on
Commit 050e6ce · 1 Parent(s): f76ba41

SYCL: Reduce most of the compiler warnings (llama/10748)


* Try to reduce some unused and typecast warnings

* Reduce compiler warnings step 2

* add a newline at the end of the file

* Initialize nreduce as size_t

* [SYCL] Remove pragma directives from mmq.cpp

* SYCL: mmq add condition to prevent blocks_per_tile_x_row variable from becoming 0

* SYCL softmax: Initialize nreduce as size_t

* ggml-sycl.cpp: fix some trailing whitespaces

* SYCL: remove the unused variables instead of commenting them out

* SYCL pool2d kernel: set NAN for invalid pooling op

* SYCL gemm.hpp: remove pragma directives

* SYCL gemm.hpp: use const cast to properly support dnnl::memory

* SYCL: wkv6 remove a comment

* SYCL: clean comments step 2

* SYCL: clean comments and variables step 3

* SYCL: Use GGML_UNUSED for unused variables

* SYCL: remove extra empty lines and a comment

* Remove TODO

* cleanup spaces

* add a stdout for unsupported op

* use sycl printf over fprintf

* remove prints for CI

* SYCL ggml-sycl: pool2D use sycl::nan and remove if-else block

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
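Most of the changes below follow one pattern: trailing C-style (void) x; casts for unused parameters become the GGML_UNUSED macro, and genuinely dead locals are deleted outright. A rough, self-contained sketch of the idea (the macro conventionally expands to a cast to void; the example function and its parameters are hypothetical, not part of this commit):

    #include <cstdio>

    // stand-in definition; in the real tree GGML_UNUSED comes from ggml.h
    #define GGML_UNUSED(x) (void)(x)

    static void example_op(int device, const float * src1, float * dst) {
        std::printf("running on device %d\n", device);
        // src1/dst are part of the common op signature but not needed here;
        // marking them keeps -Wunused-parameter quiet without changing behaviour
        GGML_UNUSED(src1);
        GGML_UNUSED(dst);
    }

    int main() {
        float out = 0.0f;
        example_op(0, nullptr, &out);
    }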

ggml/src/ggml-sycl/common.cpp CHANGED
@@ -11,6 +11,7 @@
 //
 
 #include "common.hpp"
+#include "ggml-impl.h"
 
 int get_current_device_id() {
     return dpct::dev_mgr::instance().current_device_id();
@@ -28,11 +29,7 @@ void* ggml_sycl_host_malloc(size_t size) try {
 
     if (err != 0) {
         // clear the error
-        fprintf(
-            stderr,
-            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size / 1024.0 / 1024.0,
-            "syclGetErrorString is not supported");
+        GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported");
         return nullptr;
     }
 
@@ -66,18 +63,12 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
 
     const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
 
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
     // dd = data device
     float * src0_ddf = (float *) src0->data;
     float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
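The pinned-memory warning above now goes through ggml's logging macro rather than a bare fprintf(stderr, ...), which is why ggml-impl.h is included at the top of the file. A minimal sketch of the reporting pattern, with a stand-in logger so the snippet is self-contained (the stand-in is an assumption, not the real definition; the real printf-style GGML_LOG_ERROR lives in ggml-impl.h):

    #include <cstddef>
    #include <cstdio>

    // stand-in for ggml-impl.h's logger, only so this sketch compiles on its own
    #define GGML_LOG_ERROR(...) std::fprintf(stderr, __VA_ARGS__)

    static void report_pinned_alloc_failure(std::size_t size) {
        GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
                       size / 1024.0 / 1024.0, "syclGetErrorString is not supported");
    }

    int main() {
        report_pinned_alloc_failure(32 * 1024 * 1024);
    }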
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -626,6 +626,7 @@ struct bin_bcast_sycl {
                 });
             }
         }
+        GGML_UNUSED(ctx);
     }
 };
 
ggml/src/ggml-sycl/concat.cpp CHANGED
@@ -47,7 +47,7 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
     // operation
     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(1) < ne01) { // src0
+    if (item_ct1.get_group(1) < (size_t) ne01) { // src0
         int offset_src =
             nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
         dst[offset_dst] = x[offset_src];
@@ -70,7 +70,7 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
     // operation
     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
+    if (item_ct1.get_group(0) < (size_t) ne02) { // src0
         int offset_src = nidx + item_ct1.get_group(1) * ne0 +
                          item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
         dst[offset_dst] = x[offset_src];
ggml/src/ggml-sycl/convert.cpp CHANGED
@@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y,
     const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
 
     // make each work-item deal with more elements since sycl global range can not exceed max int
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
     for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
         y[i] = x[i];
     }
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -1015,9 +1015,9 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
 }
ggml/src/ggml-sycl/dpct/helper.hpp CHANGED
@@ -1237,7 +1237,7 @@ namespace dpct
 
         std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
         {
-            auto it = m_map.upper_bound((byte_t *)ptr);
+            auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
             if (it == m_map.end())
             {
                 // Not a virtual pointer.
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -237,7 +237,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
 void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
@@ -251,8 +251,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
     // operation
     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
+    if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
         int offset_src = nidx + item_ct1.get_group(1) * ne00 +
                          item_ct1.get_group(0) * ne00 * ne01;
         dst[offset_dst] = x[offset_src];
@@ -520,9 +519,10 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

The hunks at old lines 535, 550, 564, 579, 595, 610, 625, 640, 655, 670, 685, 700, 715, 730, 749, 764, 787 and 805 apply the identical cleanup at the end of ggml_sycl_op_gelu, ggml_sycl_op_gelu_quick, ggml_sycl_op_tanh, ggml_sycl_op_relu, ggml_sycl_op_hardsigmoid, ggml_sycl_op_hardswish, ggml_sycl_op_exp, ggml_sycl_op_log, ggml_sycl_op_sigmoid, ggml_sycl_op_sqrt, ggml_sycl_op_sin, ggml_sycl_op_cos, ggml_sycl_op_step, ggml_sycl_op_neg, ggml_sycl_op_leaky_relu, ggml_sycl_op_sqr, ggml_sycl_op_upscale and ggml_sycl_op_pad: the trailing (void) src1; (void) dst; (void) src1_dd; casts become GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_dd); GGML_UNUSED(ctx);. The hunk at old line 827 makes the matching change in ggml_sycl_op_acc, replacing its single (void) dst; with GGML_UNUSED(dst); GGML_UNUSED(ctx);.
ggml/src/ggml-sycl/gemm.hpp CHANGED
@@ -51,8 +51,8 @@ public:
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
         const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
-        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
         auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
 
@@ -79,8 +79,8 @@ public:
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
         const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
-        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
         auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
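The gemm.hpp change swaps the C-style (void*) casts for const_cast when handing the const input buffers to dnnl::memory, whose constructor takes a non-const void * handle. A reduced sketch of the same idea against a generic handle-taking call (take_handle is a stand-in written for this sketch, not a oneDNN API):

    #include <cstdio>

    // stand-in for any constructor or C API that wants a mutable handle but only reads from it
    static void take_handle(void * handle) { std::printf("%p\n", handle); }

    static void wrap_const_buffer(const float * a) {
        // take_handle((void *) a);           // old style: the cast silently drops const
        take_handle(const_cast<float *>(a));  // new style: removing const is explicit and greppable
    }

    int main() {
        const float buf[4] = {0.f, 1.f, 2.f, 3.f};
        wrap_const_buffer(buf);
    }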
 
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+        GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
         return info;
     }
 
@@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 #else
     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
 
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
@@ -137,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {
 
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
         std::string backend_type = get_device_backend_and_type(device);
         int type_id = DeviceNums[backend_type]++;
         std::stringstream device_type;
@@ -420,13 +419,11 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
         return true;
     }
     return false;
-}
-catch (sycl::exception const &exc) {
-    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-              << ", line:" << __LINE__ << std::endl;
-    std::exit(1);
-}
-
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
@@ -1092,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
     size_t pool_size = 0;
 
-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
 
     ~ggml_sycl_pool_leg() {
         for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@@ -1238,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
         zeros[i] = 0.f;
         qzeros[i] = 0;
     }
-    const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+    const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
     float sum = xi[0];
     float amax = sycl::fabs(xi[0]);
 #pragma unroll
@@ -1799,6 +1793,9 @@ static void pool2d_nchw_kernel(
     switch (op) {
         case GGML_OP_POOL_AVG: res = 0; break;
         case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+        default:
+            res = (To) sycl::nan(uint32_t(0));
+            break;
     }
 
     for (int i = bh; i < eh; i += 1) {
@@ -1817,6 +1814,9 @@ static void pool2d_nchw_kernel(
             switch (op) {
                 case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
                 case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                default:
+                    res = (To) sycl::nan(uint32_t(0));
+                    break;
             }
         }
     }
@@ -1855,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
         });
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 template <typename src0_t>
@@ -1893,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
         });
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2464,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens
 
     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
-    (void) src1;
-    (void) src1_d;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
 }
 
 
@@ -2484,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne10 = src1->ne[0];
 
-    const int64_t ne0 = dst->ne[0];
 
     const int64_t row_diff = row_high - row_low;
 
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = id == ctx.device ? ne0 : row_diff;
+#endif
 
 #ifdef GGML_SYCL_F16
     bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2531,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             : src1_as_f16.get();
         ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
 
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
 #if !GGML_SYCL_DNNL
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *stream, oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2570,9 +2572,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
 #if !GGML_SYCL_DNNL
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
 # ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
             oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
@@ -2590,9 +2592,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
     }
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2638,8 +2640,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
                 item_ct1);
         });
 
-    (void) src1;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

The hunks at old lines 2654, 2673, 2694, 2713, 2735, 2758 and 2783 make the same change in ggml_sycl_op_sum, ggml_sycl_op_sum_rows, ggml_sycl_op_argsort, ggml_sycl_op_argmax, ggml_sycl_op_diag_mask_inf, ggml_sycl_op_scale and ggml_sycl_op_clamp: the trailing (void) src1; (void) dst; (void) src1_dd; casts become GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_dd); GGML_UNUSED(ctx);.

@@ -2862,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
@@ -3289,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t ne_dst = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();;
@@ -3397,6 +3405,7 @@ catch (sycl::exception const &exc) {
 
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
     return false;
 }
 
@@ -3772,7 +3781,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
         GGML_ABORT("fatal error");
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3783,7 +3792,7 @@ catch (sycl::exception const &exc) {
 static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
     ggml_sycl_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
+    GGML_UNUSED(src1);
 }
 
 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
 }
 
 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) return;
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
     check_allow_gpu_index(main_device);
     dpct::select_device(main_device);
 
@@ -4202,6 +4214,7 @@ try
 {
     ggml_backend_sycl_context *sycl_ctx =
         (ggml_backend_sycl_context *)backend->context;
+
     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
 
     const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@@ -4216,7 +4229,7 @@ catch (sycl::exception const &exc)
 }
 
 static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
-    ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
+
     sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
 
     if (ggml_backend_is_sycl(backend)) {
@@ -4624,6 +4637,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
     return nullptr;
 }
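For pool2d, the previously unhandled enum values in the two switch statements above are now covered by a default branch that poisons the result with NaN via sycl::nan(), rather than an if/else on the op. A simplified host-side sketch of the initialization branch (the enum and types are stand-ins invented for this sketch; sycl::nan takes an unsigned payload and returns a NaN of the matching float width):

    #include <sycl/sycl.hpp>
    #include <cfloat>
    #include <cstdint>

    enum class pool_op { avg, max };

    static float init_acc(pool_op op) {
        float res;
        switch (op) {
            case pool_op::avg: res = 0.0f;     break;
            case pool_op::max: res = -FLT_MAX; break;
            default:
                // unreachable for valid ops; a NaN result makes a bad op value impossible to miss
                res = sycl::nan(uint32_t(0));
                break;
        }
        return res;
    }

    int main() { return init_acc(pool_op::avg) == 0.0f ? 0 : 1; }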
 
ggml/src/ggml-sycl/im2col.cpp CHANGED
@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
         im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
     }
 
-    (void) src0;
-    (void) src0_dd;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
 }
ggml/src/ggml-sycl/mmq.cpp CHANGED
@@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 
 #pragma unroll
@@ -961,7 +961,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 
 #pragma unroll
@@ -1109,7 +1109,7 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
             dpct::sub_sat());
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
     float * x_dmf = (float *) x_dm;
 
@@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
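The load_tiles_q*_K change above guards the tile-row divisor at compile time: if the per-block quant constant (QI4_K, QI5_K or QI6_K) were ever larger than WARP_SIZE, the old WARP_SIZE / QI*_K expression would truncate to 0 and the following k % blocks_per_tile_x_row would be a modulo by zero. A stand-alone sketch with made-up constants (not the real mmq.cpp values):

    // WARP_SIZE and QI_K stand in for the real mmq.cpp constants
    constexpr int WARP_SIZE = 32;
    constexpr int QI_K      = 64;   // hypothetical value larger than WARP_SIZE

    // old form: WARP_SIZE / QI_K == 0 here, so any k % blocks_per_tile_x_row is undefined behaviour
    // constexpr int blocks_per_tile_x_row = WARP_SIZE / QI_K;

    // new form: clamp to at least 1, still evaluated at compile time
    constexpr int blocks_per_tile_x_row = QI_K > WARP_SIZE ? 1 : WARP_SIZE / QI_K;

    static_assert(blocks_per_tile_x_row >= 1, "tile-row divisor must be non-zero");

    int main() { return 0; }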
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -753,11 +753,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
-
-        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
+        stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -780,9 +776,6 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -805,9 +798,6 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -830,8 +820,6 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -854,9 +842,6 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -954,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
+
     for (int i = 0; i < src1_ncols; i++)
     {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
@@ -1023,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
             break;
     }
     }
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(ctx);
 }
ggml/src/ggml-sycl/norm.cpp CHANGED
@@ -31,7 +31,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
     mean_var = 0.f;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     for (size_t i = 0; i < nreduce; i += 1)
     {
         mean_var += s_sum[lane_id + i * WARP_SIZE];
@@ -55,7 +55,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
     start += item_ct1.get_local_id(2);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
 
     if (end >= ne_elements) {
         end = ne_elements;
@@ -163,7 +163,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     converged control flow. You may need to adjust the code.
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     tmp = 0.f;
     for (size_t i = 0; i < nreduce; i += 1)
     {
@@ -352,6 +352,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     (void)src1;
     (void)dst;
     (void)src1_dd;
+    GGML_UNUSED(ctx);
 }
 
 void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
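Declaring nreduce as size_t in norm.cpp (and likewise in softmax.cpp below) matters because the reduction loops already use a size_t counter, so "i < nreduce" was a signed/unsigned comparison and a steady source of -Wsign-compare noise. A tiny stand-alone sketch of the before/after, with invented parameter names and values:

    #include <cstddef>

    static float reduce_partial(const float * s_sum, int lane_id, int nwarps, int warp_size) {
        // int nreduce = nwarps / warp_size;     // old: comparing size_t i against int warns
        size_t nreduce = nwarps / warp_size;     // new: both sides of the comparison are unsigned
        float acc = 0.f;
        for (size_t i = 0; i < nreduce; i += 1) {
            acc += s_sum[lane_id + i * warp_size];
        }
        return acc;
    }

    int main() {
        const float partial[64] = {};
        return (int) reduce_partial(partial, 0, 32, 32);
    }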
ggml/src/ggml-sycl/rope.cpp CHANGED
@@ -269,7 +269,8 @@ void ggml_sycl_op_rope(
         }
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
ggml/src/ggml-sycl/softmax.cpp CHANGED
@@ -16,7 +16,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
     const int nthreads = block_size;
     const int nwarps = nthreads / WARP_SIZE;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     float slope = 1.0f;
 
     // ALiBi
@@ -53,8 +53,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     if (block_size > WARP_SIZE) {
         if (warp_id == 0) {
             buf[lane_id] = -INFINITY;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = -INFINITY;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
@@ -63,8 +64,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         max_val = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
         }
         max_val = warp_reduce_max(max_val, item_ct1);
@@ -89,8 +89,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = 0.f;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
@@ -100,8 +101,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
         tmp = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             tmp += buf[lane_id + i * WARP_SIZE];
         }
         tmp = warp_reduce_sum(tmp, item_ct1);
ggml/src/ggml-sycl/tsembd.cpp CHANGED
@@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml
     const int max_period = dst->op_params[1];
 
     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+    GGML_UNUSED(src1);
 }
ggml/src/ggml-sycl/wkv6.cpp CHANGED
@@ -59,7 +59,7 @@ static void rwkv_wkv_f32_kernel(
             float y = 0;
 
             // Process in chunks of 4 for better vectorization
-            sycl::float4 k4, r4, tf4, td4, s4, kv4;
+            sycl::float4 k4, r4, tf4, td4, s4;
 #pragma unroll
             for (int j = 0; j < head_size; j += 4) {
                 // Load data in vec4 chunks
@@ -135,4 +135,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
             );
         });
     });
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
 }