Akarshan Biswas committed
Commit d2ce872 · 1 Parent(s): 5fb7320

SYCL: Add all missing unary kernels (llama/13074)

* SYCL: Add all missing unary kernels

ggml-ci

* decouple kernel launch range from data size using strided loop (see the sketch below)

* use ceil_div helper for num_blocks
ggml-ci

* clean auto imported header files
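
The pattern behind the second and third bullets, as a minimal self-contained sketch. This is illustrative only, not the exact kernels from this commit: launch_unary and op are made-up names, and x/dst are assumed to be USM device pointers as in ggml-sycl.

#include <sycl/sycl.hpp>

// same role as the ceil_div helper added to common.hpp below
constexpr size_t ceil_div(const size_t m, const size_t n) { return (m + n - 1) / n; }

// Launch an elementwise op over k elements with a fixed work-group size of 256.
// The grid-stride loop inside the kernel decouples the launch range from k:
// any global range covers any k, each work-item stepping by the total range.
template <typename T, typename F>
static void launch_unary(const T * x, T * dst, const int k, F op, sycl::queue & q) {
    const size_t wg      = 256;                       // hard-coded work-group size, as in the commit
    const size_t num_wgs = ceil_div((size_t) k, wg);  // rounded-up number of work-groups
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_wgs * wg), sycl::range<1>(wg)),
        [=](sycl::nd_item<1> it) {
            // strided loop: launch range no longer has to match the data size exactly
            for (size_t i = it.get_global_id(0); i < (size_t) k; i += it.get_global_range(0)) {
                dst[i] = op(x[i]);
            }
        });
}

// usage, e.g.: launch_unary(x, dst, k, [](float v) { return sycl::fabs(v); }, q);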

ggml/src/ggml-sycl/common.hpp CHANGED
@@ -493,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
 
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
+constexpr size_t ceil_div(const size_t m, const size_t n) {
+    return (m + n - 1) / n;
+}
+
 bool gpu_has_xmx(sycl::device &dev);
 #endif // GGML_SYCL_COMMON_HPP
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -21,6 +21,27 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }
 
+template<typename T>
+static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
+    }
+}
+
+template<typename T>
+static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = sycl::fabs(x[i]);
+    }
+}
+
+template<typename T>
+static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
+    }
+}
+
 template<typename T>
 static void gelu(const T * x, T * dst, const int k,
                  const sycl::nd_item<3> &item_ct1) {

@@ -335,6 +356,37 @@ static void silu_sycl(const T *x, T *dst, const int k,
         });
 }
 
+template<typename T>
+static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            sgn(x, dst, k, item_ct1);
+        });
+}
+
+template<typename T>
+static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            abs_op(x, dst, k, item_ct1);
+        });
+}
+
+
+template<typename T>
+static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+        sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            elu_op(x, dst, k, item_ct1);
+        });
+}
+
 template<typename T>
 static void gelu_quick_sycl(const T *x, T *dst, const int k,
                             queue_ptr stream) {

@@ -574,6 +626,106 @@ static void clamp_sycl(const T *x, T *dst, const float min,
         });
 }
 
+inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+
+inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
 inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);

@@ -1388,3 +1540,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_sgn(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_abs(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_elu(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
ggml/src/ggml-sycl/element_wise.hpp CHANGED
@@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 #endif // GGML_SYCL_ELEMENTWISE_HPP

ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -38,6 +38,7 @@
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
+#include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 #include "ggml-sycl/sycl_hw.hpp"

@@ -3355,6 +3356,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_UNARY_OP_EXP:
             ggml_sycl_exp(ctx, dst);
             break;
+        case GGML_UNARY_OP_SGN:
+            ggml_sycl_sgn(ctx, dst);
+            break;
+        case GGML_UNARY_OP_ABS:
+            ggml_sycl_abs(ctx, dst);
+            break;
+        case GGML_UNARY_OP_ELU:
+            ggml_sycl_elu(ctx, dst);
+            break;
         default:
             return false;
         }

@@ -3837,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
+            case GGML_UNARY_OP_SGN:
+            case GGML_UNARY_OP_ABS:
+            case GGML_UNARY_OP_ELU:
 #if defined (GGML_SYCL_F16)
                 return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
 #else
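
For context, a minimal sketch of how these new dispatch cases are reached from the public ggml API (unary_ops_example is an illustrative name; running the graph on the SYCL backend additionally requires the ggml-backend buffer and scheduler setup, which is omitted here):

#include "ggml.h"

// build a tiny graph whose unary nodes map to the kernels added in this commit
static void unary_ops_example(void) {
    struct ggml_init_params params = { /*mem_size =*/ 16 * 1024 * 1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * y = ggml_elu(ctx, ggml_abs(ctx, ggml_sgn(ctx, x)));
    (void) y;

    // when a graph containing these nodes is evaluated on the SYCL backend, each unary node
    // is dispatched through ggml_sycl_compute_forward() to the GGML_UNARY_OP_SGN / ABS / ELU
    // cases shown above
    ggml_free(ctx);
}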