lhez and Shangqing Gu committed
Commit 291a5b7 · 1 parent: 5d27bbf

opencl: split ggml-opencl.cl into multiple files and cleanup (llama/12886)


---------

Co-authored-by: Shangqing Gu <[email protected]>

Files changed (37)
  1. ggml/src/ggml-opencl/CMakeLists.txt +35 -10
  2. ggml/src/ggml-opencl/ggml-opencl.cpp +782 -271
  3. ggml/src/ggml-opencl/kernels/add.cl +83 -0
  4. ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  5. ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  6. ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  7. ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  8. ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  9. ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  10. ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  11. ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  12. ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  13. ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  14. ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  15. ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  16. ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  17. ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  18. ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  19. ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  20. ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  21. ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  22. ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  23. ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  24. ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  25. ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  26. ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  27. ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  28. ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  29. ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  30. ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  31. ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  32. ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  33. ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  34. ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  35. ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  36. ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  37. ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
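Each file under kernels/ is now compiled into its own cl_program when the backend initializes, and the individual kernels are created from that program. The sketch below is a hedged, simplified illustration of that per-file pattern rather than the exact ggml code: the read_file helper shown here and the generated add.cl.h header stand in for the helpers and build machinery used in ggml-opencl.cpp (build_program_from_source, CL_CHECK, GGML_LOG_CONT), and error handling is omitted for brevity.

    // Simplified sketch of the per-kernel loading pattern (assumptions noted above).
    #include <CL/cl.h>
    #include <fstream>
    #include <sstream>
    #include <string>

    // Stand-in for the read_file helper in ggml-opencl.cpp.
    static std::string read_file(const std::string & path) {
        std::ifstream ifs(path);
        std::ostringstream ss;
        ss << ifs.rdbuf();
        return ss.str();
    }

    static cl_kernel load_add_kernel(cl_context ctx, cl_device_id dev) {
    #ifdef GGML_OPENCL_EMBED_KERNELS
        // add.cl.h is assumed to be generated at build time and to contain the
        // kernel source as a single string literal.
        const std::string kernel_src {
            #include "add.cl.h"
        };
    #else
        // Otherwise the .cl file is read from disk at runtime.
        const std::string kernel_src = read_file("add.cl");
    #endif
        const char * src  = kernel_src.c_str();
        cl_int       err  = CL_SUCCESS;
        cl_program   prog = clCreateProgramWithSource(ctx, 1, &src, nullptr, &err);
        // Build options stand in for the "-cl-std=CLx.y ..." flags assembled in the real code.
        clBuildProgram(prog, 1, &dev, "-cl-std=CL3.0 -cl-mad-enable", nullptr, nullptr);
        return clCreateKernel(prog, "kernel_add", &err);
    }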
ggml/src/ggml-opencl/CMakeLists.txt CHANGED
@@ -54,16 +54,41 @@ function(ggml_opencl_add_kernel KNAME)
  endfunction()
 
  set(GGML_OPENCL_KERNELS
- ggml-opencl
- ggml-opencl_mm
- ggml-opencl_cvt
- ggml-opencl_gemv_noshuffle
- ggml-opencl_gemv_noshuffle_general
- ggml-opencl_mul_mat_Ab_Bi_8x4
- ggml-opencl_transpose_16
- ggml-opencl_transpose_32
- ggml-opencl_transpose_32_16
- ggml-opencl_im2col
+ add
+ clamp
+ cpy
+ cvt
+ diag_mask_inf
+ gelu
+ gemv_noshuffle_general
+ gemv_noshuffle
+ get_rows
+ im2col_f32
+ im2col_f16
+ mul_mat_Ab_Bi_8x4
+ mul_mv_f16_f16
+ mul_mv_f16_f32_1row
+ mul_mv_f16_f32_l4
+ mul_mv_f16_f32
+ mul_mv_f32_f32
+ mul_mv_q4_0_f32
+ mul_mv_q4_0_f32_v
+ mul_mv_q4_0_f32_8x_flat
+ mul_mv_q4_0_f32_1d_8x_flat
+ mul_mv_q4_0_f32_1d_16x_flat
+ mul_mv_q6_k
+ mul
+ norm
+ relu
+ rms_norm
+ rope
+ scale
+ silu
+ softmax_4_f32
+ softmax_4_f16
+ softmax_f32
+ softmax_f16
+ transpose
  )
 
  foreach (K ${GGML_OPENCL_KERNELS})
ggml/src/ggml-opencl/ggml-opencl.cpp CHANGED
@@ -64,11 +64,33 @@ enum ADRENO_GPU_GEN {
64
  X1E,
65
  };
66
 
67
  struct ggml_cl_version {
68
  cl_uint major = 0;
69
  cl_uint minor = 0;
70
  };
71
 
72
  // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
73
  static ggml_cl_version parse_cl_version(std::string_view str) {
74
  size_t major_str_begin = 0;
@@ -173,24 +195,30 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
173
  return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
174
  }
175
 
176
- static int get_adreno_cl_compiler_version(const char *driver_version) {
177
  std::string driver_ver_str(driver_version);
 
178
  size_t compiler_ver_pos = driver_ver_str.find("E031");
179
  size_t compiler_ver_len = 13;
180
- size_t compiler_ver_offset = 5;
 
 
181
 
182
  if (compiler_ver_pos == std::string::npos) {
183
  compiler_ver_pos = driver_ver_str.find("DX");
184
  if (compiler_ver_pos == std::string::npos) {
185
- return -1;
186
  }
 
187
  compiler_ver_len = 11;
188
- compiler_ver_offset = 3;
189
  }
190
 
191
  std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
192
- std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2);
193
- return std::atoi(major_ver_str.c_str());
 
 
194
  }
195
 
196
  // backend device context
@@ -215,16 +243,48 @@ struct ggml_backend_opencl_context {
215
  cl_int alignment;
216
  size_t max_alloc_size;
217
  bool fp16_support;
 
 
218
 
219
  int adreno_wave_size;
220
 
221
  cl_context context;
222
  cl_command_queue queue;
223
 
224
- cl_program program;
225
- cl_program program_1;
226
- cl_program program_2;
227
- cl_program program_im2col;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  cl_kernel kernel_add, kernel_add_row;
230
  cl_kernel kernel_mul, kernel_mul_row;
@@ -249,19 +309,17 @@ struct ggml_backend_opencl_context {
249
  cl_kernel kernel_mul_mat_f16_f32;
250
  cl_kernel kernel_mul_mat_f16_f32_l4;
251
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
252
- cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
253
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
254
- cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
255
- kernel_mul_mat_q4_0_f32_flat_img_v0;
256
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
257
  cl_kernel kernel_mul_mv_q6_K_f32;
258
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
259
 
260
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
261
  // Transpose kernels
262
- cl_program program_transpose_32;
263
- cl_program program_transpose_32_16;
264
- cl_program program_transpose_16;
265
  cl_kernel kernel_transpose_32;
266
  cl_kernel kernel_transpose_32_16;
267
  cl_kernel kernel_transpose_16;
@@ -374,6 +432,681 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
374
  return p;
375
  }
376
 
377
  static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
378
  static bool initialized = false;
379
  static ggml_backend_opencl_context *backend_ctx = nullptr;
@@ -612,11 +1345,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
612
  GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
613
  backend_ctx->driver_version = driver_version;
614
 
615
- int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
616
- bool has_vector_subgroup_broadcast =
617
- adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
 
618
  GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
619
- has_vector_subgroup_broadcast ? "true" : "false");
620
 
621
  size_t ext_str_size;
622
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
@@ -691,247 +1425,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
691
  #endif
692
  CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
693
 
694
- #ifdef GGML_OPENCL_EMBED_KERNELS
695
- const std::string kernel_src {
696
- #include "ggml-opencl.cl.h"
697
- };
698
- #else
699
- const std::string kernel_src = read_file("ggml-opencl.cl");
700
- #endif
701
-
702
- auto opencl_c_std =
703
- std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
704
-
705
- std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
706
- " -cl-mad-enable -cl-unsafe-math-optimizations"
707
- " -cl-finite-math-only -cl-fast-relaxed-math";
708
- backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
709
-
710
- // Non matmul kernels.
711
- CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
712
- CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
713
- CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
714
- CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
715
- CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
716
- CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
717
- CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
718
- CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
719
- CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
720
- CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
721
- CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
722
- CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
723
- CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program, "kernel_gelu_quick", &err), err));
724
- CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_quick_4", &err), err));
725
- CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
726
- CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
727
- CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
728
- CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
729
- CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
730
- CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
731
- CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
732
- CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
733
- CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
734
- CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
735
- CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
736
- CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
737
- CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
738
- CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
739
- CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f32", &err), err));
740
- CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f16", &err), err));
741
- CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f32", &err), err));
742
- CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f16", &err), err));
743
- CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
744
- CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
745
- CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
746
- CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
747
-
748
- // Matmul kernels.
749
- CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
750
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
751
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
752
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
753
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
754
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
755
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
756
-
757
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
758
- CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
759
- CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
760
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
761
-
762
- // Load additional mulmat kernels.
763
- #ifdef GGML_OPENCL_EMBED_KERNELS
764
- const std::string kernel_src_1 {
765
- #include "ggml-opencl_mm.cl.h"
766
- };
767
- #else
768
- const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
769
- #endif
770
- backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
771
-
772
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
773
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
774
- CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
775
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
776
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
777
-
778
- // Load additional data conversion kernels.
779
- #ifdef GGML_OPENCL_EMBED_KERNELS
780
- const std::string kernel_src_2 {
781
- #include "ggml-opencl_cvt.cl.h"
782
- };
783
- #else
784
- const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
785
- #endif
786
- backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
787
-
788
- CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
789
 
790
- // im2col kernels
791
- #ifdef GGML_OPENCL_EMBED_KERNELS
792
- const std::string kernel_src_im2col {
793
- #include "ggml-opencl_im2col.cl.h"
794
- };
795
- #else
796
- const std::string kernel_src_im2col = read_file("ggml-opencl_im2col.cl");
797
- #endif
798
- backend_ctx->program_im2col = build_program_from_source(context, device, kernel_src_im2col.c_str(), compile_opts);
799
-
800
- CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f32", &err), err));
801
- CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f16", &err), err));
802
-
803
- // Kernels for Adreno
804
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
805
- #ifdef GGML_OPENCL_EMBED_KERNELS
806
- const std::string transpose_32_src {
807
- #include "ggml-opencl_transpose_32.cl.h"
808
- };
809
- #else
810
- const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
811
- #endif
812
- backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
813
- CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
814
-
815
- #ifdef GGML_OPENCL_EMBED_KERNELS
816
- const std::string transpose_32_16_src {
817
- #include "ggml-opencl_transpose_32_16.cl.h"
818
- };
819
- #else
820
- const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
821
- #endif
822
- backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
823
- CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
824
-
825
- #ifdef GGML_OPENCL_EMBED_KERNELS
826
- const std::string transpose_16_src {
827
- #include "ggml-opencl_transpose_16.cl.h"
828
- };
829
- #else
830
- const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
831
- #endif
832
- backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
833
- CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
834
-
835
- // Gemv general
836
- std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
837
- " -cl-mad-enable "
838
- " -DSIMDGROUP_WIDTH=" +
839
- std::to_string(backend_ctx->adreno_wave_size);
840
- if (has_vector_subgroup_broadcast) {
841
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
842
- }
843
- #ifdef GGML_OPENCL_EMBED_KERNELS
844
- const std::string kernel_src_CL_gemv_general {
845
- #include "ggml-opencl_gemv_noshuffle_general.cl.h"
846
- };
847
- #else
848
- const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
849
- #endif
850
-
851
- backend_ctx->program_CL_gemv_general = build_program_from_source(
852
- context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
853
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
854
-
855
- // Gemv 2048, 16384
856
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
857
- " -cl-mad-enable "
858
- " -DLINE_STRIDE_A=2048 "
859
- " -DBLOCK_STRIDE_A=16384 "
860
- " -DSIMDGROUP_WIDTH=" +
861
- std::to_string(backend_ctx->adreno_wave_size);
862
- if (has_vector_subgroup_broadcast) {
863
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
864
- }
865
- #ifdef GGML_OPENCL_EMBED_KERNELS
866
- const std::string kernel_src_CL_gemv {
867
- #include "ggml-opencl_gemv_noshuffle.cl.h"
868
- };
869
- #else
870
- const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
871
- #endif
872
-
873
- backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
874
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
875
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
876
-
877
- // Gemv 2048, 16384
878
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
879
- " -cl-mad-enable "
880
- " -DLINE_STRIDE_A=2048 "
881
- " -DBLOCK_STRIDE_A=16384 "
882
- " -DSIMDGROUP_WIDTH=" +
883
- std::to_string(backend_ctx->adreno_wave_size);
884
- if (has_vector_subgroup_broadcast) {
885
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
886
- }
887
-
888
- backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
889
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
890
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
891
-
892
- // Gemv 5504, 44032
893
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
894
- " -cl-mad-enable "
895
- " -DLINE_STRIDE_A=5504 "
896
- " -DBLOCK_STRIDE_A=44032 "
897
- " -DSIMDGROUP_WIDTH=" +
898
- std::to_string(backend_ctx->adreno_wave_size);
899
- if (has_vector_subgroup_broadcast) {
900
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
901
- }
902
-
903
- backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
904
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
905
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
906
-
907
- // Gemv 16000, 128000
908
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
909
- " -cl-mad-enable "
910
- " -DLINE_STRIDE_A=16000 "
911
- " -DBLOCK_STRIDE_A=128000 "
912
- " -DSIMDGROUP_WIDTH=" +
913
- std::to_string(backend_ctx->adreno_wave_size);
914
- if (has_vector_subgroup_broadcast) {
915
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
916
- }
917
-
918
- backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
919
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
920
-
921
- // Gemm
922
- #ifdef GGML_OPENCL_EMBED_KERNELS
923
- const std::string kernel_src_CL_gemm {
924
- #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
925
- };
926
- #else
927
- const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
928
- #endif
929
- backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
930
- CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
931
-
932
- // TODO: fixme: these sizes are hardcoded for now.
933
- // they should be allocated based on the model's size
934
- // and the device's max alloc size
935
  // Allocate intermediate buffers and images
936
  size_t required_A_q_d_bytes = 311164928;
937
  size_t required_A_s_d_bytes = 38895616;
@@ -1495,8 +1992,15 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
1495
 
1496
  // The optimized gemm and gemv kernels are used for large matrices without batch.
1497
  // tensor is the quantized weights matrix.
1498
- inline bool use_adreno_kernels(const ggml_tensor *tensor) {
1499
- return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 &&
1500
  tensor->ne[2] == 1 && tensor->ne[3] == 1;
1501
  }
1502
 
@@ -1574,7 +2078,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1574
  cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
1575
 
1576
  // The optimized kernels need weights in natural order, so unshuffle.
1577
- if (use_adreno_kernels(tensor)) {
1578
  kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
1579
  }
1580
  #else
@@ -1598,7 +2102,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1598
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1599
  // Only do transpose for large, non batched matrix
1600
  // TODO: use preallocated images instead of sub-buffer then image
1601
- if (use_adreno_kernels(tensor)) {
1602
  // <----------------------------------------------------------------------------------> //
1603
  // start transpose
1604
  // <----------------------------------------------------------------------------------> //
@@ -2899,8 +3403,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
2899
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2900
  cl_command_queue queue = backend_ctx->queue;
2901
 
2902
- ggml_backend_opencl_device_context * dev_ctx =
2903
- (ggml_backend_opencl_device_context *)backend->device->context;
2904
 
2905
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2906
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -2931,13 +3435,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
2931
 
2932
  // Note, this kernel declares local memory in kernel args and the size
2933
  // depends on subgroup size.
2934
- // Retrieve subgroup size.
2935
  // Note, this requires OpenCL 2.1 and above
 
2936
  size_t sgs;
2937
- CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
2938
- CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
2939
- sizeof(local_work_size), local_work_size,
2940
- sizeof(size_t), &sgs, NULL));
2941
 
2942
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2943
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3030,7 +3541,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
3030
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3031
  cl_context context = backend_ctx->context;
3032
 
3033
- if (ne01 && ne1 && use_adreno_kernels(src0)) {
3034
 
3035
  // init CL objects
3036
  // <--------------------------------------------> //
 
64
  X1E,
65
  };
66
 
67
+ enum ADRENO_CL_COMPILER_TYPE {
68
+ E031,
69
+ DX,
70
+ };
71
+
72
  struct ggml_cl_version {
73
  cl_uint major = 0;
74
  cl_uint minor = 0;
75
  };
76
 
77
+ struct ggml_cl_compiler_version {
78
+ ADRENO_CL_COMPILER_TYPE type;
79
+ int major = -1;
80
+ int minor = -1;
81
+ int patch = -1;
82
+
83
+ bool same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
84
+ return major == x && minor == y && patch == z && type == t;
85
+ }
86
+ bool newer_than(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
87
+ return major*10000 + minor*100 + patch > x*10000 + y*100 + z && type == t;
88
+ }
89
+ bool newer_than_or_same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
90
+ return same(t, x, y, z) || newer_than(t, x, y, z);
91
+ }
92
+ };
93
+
94
  // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
95
  static ggml_cl_version parse_cl_version(std::string_view str) {
96
  size_t major_str_begin = 0;
 
195
  return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
196
  }
197
 
198
+ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *driver_version) {
199
  std::string driver_ver_str(driver_version);
200
+ ADRENO_CL_COMPILER_TYPE type = ADRENO_CL_COMPILER_TYPE::E031;
201
  size_t compiler_ver_pos = driver_ver_str.find("E031");
202
  size_t compiler_ver_len = 13;
203
+ size_t compiler_major_offset = 5;
204
+ size_t compiler_minor_offset = 8;
205
+ size_t compiler_patch_offset = 11;
206
 
207
  if (compiler_ver_pos == std::string::npos) {
208
  compiler_ver_pos = driver_ver_str.find("DX");
209
  if (compiler_ver_pos == std::string::npos) {
210
+ return {};
211
  }
212
+ type = ADRENO_CL_COMPILER_TYPE::DX;
213
  compiler_ver_len = 11;
214
+ compiler_major_offset = 3;
215
  }
216
 
217
  std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
218
+ int major = std::atoi(compiler_ver_str.substr(compiler_major_offset, 2).c_str());
219
+ int minor = std::atoi(compiler_ver_str.substr(compiler_minor_offset, 2).c_str());
220
+ int patch = std::atoi(compiler_ver_str.substr(compiler_patch_offset, 2).c_str());
221
+ return { type, major, minor, patch };
222
  }
223
 
224
  // backend device context
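For orientation, here is a hedged usage sketch of the version helpers introduced above, mirroring the gating that load_cl_kernels() applies further down. The driver string is a made-up example of the E031.xx.yy.zz shape the parser expects, and the snippet assumes the enum, struct, and function definitions from this diff are in scope.

    #include <cstdio>

    int main() {
        // Made-up driver string containing a compiler tag of the expected shape.
        ggml_cl_compiler_version v =
            get_adreno_cl_compiler_version("Driver ... Compiler E031.38.11.00");

        // Same condition used later for the q4_0 1d_8x/1d_16x kernels: require a
        // new-enough E031 compiler, or the DX toolchain.
        bool usable = v.newer_than_or_same(E031, 38, 11, 0) || v.type == DX;

        std::printf("compiler %d.%d.%d usable: %d\n", v.major, v.minor, v.patch, usable);
        return 0;
    }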
 
243
  cl_int alignment;
244
  size_t max_alloc_size;
245
  bool fp16_support;
246
+ bool has_vector_subgroup_broadcast;
247
+ ggml_cl_compiler_version adreno_cl_compiler_version;
248
 
249
  int adreno_wave_size;
250
 
251
  cl_context context;
252
  cl_command_queue queue;
253
 
254
+ cl_program program_add;
255
+ cl_program program_clamp;
256
+ cl_program program_cpy;
257
+ cl_program program_cvt;
258
+ cl_program program_diag_mask_inf;
259
+ cl_program program_gelu;
260
+ cl_program program_gemv_noshuffle_general;
261
+ cl_program program_gemv_noshuffle;
262
+ cl_program program_get_rows;
263
+ cl_program program_im2col_f16;
264
+ cl_program program_im2col_f32;
265
+ cl_program program_mul_mat_Ab_Bi_8x4;
266
+ cl_program program_mul_mv_q4_0_f32;
267
+ cl_program program_mul_mv_q4_0_f32_v;
268
+ cl_program program_mul_mv_q4_0_f32_8x_flat;
269
+ cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
270
+ cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
271
+ cl_program program_mul_mv_q6_K;
272
+ cl_program program_mul_mv_f16_f16;
273
+ cl_program program_mul_mv_f16_f32_1row;
274
+ cl_program program_mul_mv_f16_f32_l4;
275
+ cl_program program_mul_mv_f16_f32;
276
+ cl_program program_mul_mv_f32_f32;
277
+ cl_program program_mul;
278
+ cl_program program_norm;
279
+ cl_program program_relu;
280
+ cl_program program_rms_norm;
281
+ cl_program program_rope;
282
+ cl_program program_scale;
283
+ cl_program program_silu;
284
+ cl_program program_softmax_f32;
285
+ cl_program program_softmax_f16;
286
+ cl_program program_softmax_4_f32;
287
+ cl_program program_softmax_4_f16;
288
 
289
  cl_kernel kernel_add, kernel_add_row;
290
  cl_kernel kernel_mul, kernel_mul_row;
 
309
  cl_kernel kernel_mul_mat_f16_f32;
310
  cl_kernel kernel_mul_mat_f16_f32_l4;
311
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
312
+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
313
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
314
+ cl_kernel kernel_convert_block_q4_0_noshuffle;
 
315
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
316
  cl_kernel kernel_mul_mv_q6_K_f32;
317
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
318
 
319
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
320
  // Transpose kernels
321
+ cl_program program_transpose;
322
+
 
323
  cl_kernel kernel_transpose_32;
324
  cl_kernel kernel_transpose_32_16;
325
  cl_kernel kernel_transpose_16;
 
432
  return p;
433
  }
434
 
435
+ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
436
+ cl_int err;
437
+
438
+ // compiler options for general kernels
439
+ auto opencl_c_std =
440
+ std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
441
+ std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
442
+ " -cl-mad-enable -cl-unsafe-math-optimizations"
443
+ " -cl-finite-math-only -cl-fast-relaxed-math";
444
+
445
+ GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
446
+
447
+ // add
448
+ {
449
+ #ifdef GGML_OPENCL_EMBED_KERNELS
450
+ const std::string kernel_src {
451
+ #include "add.cl.h"
452
+ };
453
+ #else
454
+ const std::string kernel_src = read_file("add.cl");
455
+ #endif
456
+ backend_ctx->program_add =
457
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
458
+
459
+ CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
460
+ CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
461
+ GGML_LOG_CONT(".");
462
+ }
463
+
464
+ // clamp
465
+ {
466
+ #ifdef GGML_OPENCL_EMBED_KERNELS
467
+ const std::string kernel_src {
468
+ #include "clamp.cl.h"
469
+ };
470
+ #else
471
+ const std::string kernel_src = read_file("clamp.cl");
472
+ #endif
473
+ backend_ctx->program_clamp =
474
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
475
+
476
+ CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program_clamp, "kernel_clamp", &err), err));
477
+ GGML_LOG_CONT(".");
478
+ }
479
+
480
+ // cpy
481
+ {
482
+ #ifdef GGML_OPENCL_EMBED_KERNELS
483
+ const std::string kernel_src {
484
+ #include "cpy.cl.h"
485
+ };
486
+ #else
487
+ const std::string kernel_src = read_file("cpy.cl");
488
+ #endif
489
+ backend_ctx->program_cpy =
490
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
491
+
492
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
493
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
494
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
495
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
496
+ GGML_LOG_CONT(".");
497
+ }
498
+
499
+ // cvt
500
+ {
501
+ #ifdef GGML_OPENCL_EMBED_KERNELS
502
+ const std::string kernel_src {
503
+ #include "cvt.cl.h"
504
+ };
505
+ #else
506
+ const std::string kernel_src = read_file("cvt.cl");
507
+ #endif
508
+ backend_ctx->program_cvt =
509
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
510
+
511
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
512
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
513
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
514
+ GGML_LOG_CONT(".");
515
+ }
516
+
517
+ // diag_mask_inf
518
+ {
519
+ #ifdef GGML_OPENCL_EMBED_KERNELS
520
+ const std::string kernel_src {
521
+ #include "diag_mask_inf.cl.h"
522
+ };
523
+ #else
524
+ const std::string kernel_src = read_file("diag_mask_inf.cl");
525
+ #endif
526
+ backend_ctx->program_diag_mask_inf =
527
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
528
+
529
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf_8", &err), err));
530
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf", &err), err));
531
+ GGML_LOG_CONT(".");
532
+ }
533
+
534
+ // gelu
535
+ {
536
+ #ifdef GGML_OPENCL_EMBED_KERNELS
537
+ const std::string kernel_src {
538
+ #include "gelu.cl.h"
539
+ };
540
+ #else
541
+ const std::string kernel_src = read_file("gelu.cl");
542
+ #endif
543
+ backend_ctx->program_gelu =
544
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
545
+
546
+ CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
547
+ CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
548
+ CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
549
+ CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
550
+ GGML_LOG_CONT(".");
551
+ }
552
+
553
+ // get_rows
554
+ {
555
+ #ifdef GGML_OPENCL_EMBED_KERNELS
556
+ const std::string kernel_src {
557
+ #include "get_rows.cl.h"
558
+ };
559
+ #else
560
+ const std::string kernel_src = read_file("get_rows.cl");
561
+ #endif
562
+ backend_ctx->program_get_rows =
563
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
564
+
565
+ CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f32", &err), err));
566
+ CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f16", &err), err));
567
+ CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_q4_0", &err), err));
568
+ GGML_LOG_CONT(".");
569
+ }
570
+
571
+ // im2col_f32
572
+ {
573
+ #ifdef GGML_OPENCL_EMBED_KERNELS
574
+ const std::string kernel_src {
575
+ #include "im2col_f32.cl.h"
576
+ };
577
+ #else
578
+ const std::string kernel_src = read_file("im2col_f32.cl");
579
+ #endif
580
+ backend_ctx->program_im2col_f32 =
581
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
582
+
583
+ CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
584
+ GGML_LOG_CONT(".");
585
+ }
586
+
587
+ // im2col_f16
588
+ {
589
+ #ifdef GGML_OPENCL_EMBED_KERNELS
590
+ const std::string kernel_src {
591
+ #include "im2col_f16.cl.h"
592
+ };
593
+ #else
594
+ const std::string kernel_src = read_file("im2col_f16.cl");
595
+ #endif
596
+ backend_ctx->program_im2col_f16 =
597
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
598
+
599
+ CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
600
+ GGML_LOG_CONT(".");
601
+ }
602
+
603
+ // mul_mv_q4_0_f32
604
+ {
605
+ #ifdef GGML_OPENCL_EMBED_KERNELS
606
+ const std::string kernel_src {
607
+ #include "mul_mv_q4_0_f32.cl.h"
608
+ };
609
+ #else
610
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
611
+ #endif
612
+ backend_ctx->program_mul_mv_q4_0_f32 =
613
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
614
+
615
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
616
+ GGML_LOG_CONT(".");
617
+ }
618
+
619
+ // mul_mv_q4_0_f32_v
620
+ {
621
+ #ifdef GGML_OPENCL_EMBED_KERNELS
622
+ const std::string kernel_src {
623
+ #include "mul_mv_q4_0_f32_v.cl.h"
624
+ };
625
+ #else
626
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
627
+ #endif
628
+ backend_ctx->program_mul_mv_q4_0_f32_v =
629
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
630
+
631
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
632
+ GGML_LOG_CONT(".");
633
+ }
634
+
635
+ // mul_mv_q4_0_f32_8x_flat
636
+ {
637
+ #ifdef GGML_OPENCL_EMBED_KERNELS
638
+ const std::string kernel_src {
639
+ #include "mul_mv_q4_0_f32_8x_flat.cl.h"
640
+ };
641
+ #else
642
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
643
+ #endif
644
+ backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
645
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
646
+
647
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
648
+ GGML_LOG_CONT(".");
649
+ }
650
+
651
+ // mul_mv_q4_0_f32_1d_8x_flat
652
+ // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
653
+ // those compiler versions since it is not used for Adreno anyway.
654
+ if (backend_ctx->gpu_family != ADRENO ||
655
+ backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
656
+ backend_ctx->adreno_cl_compiler_version.type == DX) {
657
+ #ifdef GGML_OPENCL_EMBED_KERNELS
658
+ const std::string kernel_src {
659
+ #include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
660
+ };
661
+ #else
662
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
663
+ #endif
664
+ backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
665
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
666
+
667
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
668
+ GGML_LOG_CONT(".");
669
+ }
670
+
671
+ // mul_mv_q4_0_f32_1d_16x_flat
672
+ // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
673
+ // those compiler versions since it is not used for Adreno anyway.
674
+ if (backend_ctx->gpu_family != ADRENO ||
675
+ backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
676
+ backend_ctx->adreno_cl_compiler_version.type == DX) {
677
+ #ifdef GGML_OPENCL_EMBED_KERNELS
678
+ const std::string kernel_src {
679
+ #include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
680
+ };
681
+ #else
682
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
683
+ #endif
684
+ backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
685
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
686
+
687
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
688
+ GGML_LOG_CONT(".");
689
+ }
690
+
691
+ // mul_mv_q6_k
692
+ {
693
+ #ifdef GGML_OPENCL_EMBED_KERNELS
694
+ const std::string kernel_src {
695
+ #include "mul_mv_q6_k.cl.h"
696
+ };
697
+ #else
698
+ const std::string kernel_src = read_file("mul_mv_q6_k.cl");
699
+ #endif
700
+ backend_ctx->program_mul_mv_q6_K =
701
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
702
+
703
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_mul_mv_q6_K, "kernel_mul_mv_q6_K_f32", &err), err));
704
+ GGML_LOG_CONT(".");
705
+ }
706
+
707
+ // mul_mv_f16_f16
708
+ {
709
+ #ifdef GGML_OPENCL_EMBED_KERNELS
710
+ const std::string kernel_src {
711
+ #include "mul_mv_f16_f16.cl.h"
712
+ };
713
+ #else
714
+ const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
715
+ #endif
716
+ backend_ctx->program_mul_mv_f16_f16 =
717
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
718
+
719
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
720
+ GGML_LOG_CONT(".");
721
+ }
722
+
723
+ // mul_mv_f16_f32_1row
724
+ {
725
+ #ifdef GGML_OPENCL_EMBED_KERNELS
726
+ const std::string kernel_src {
727
+ #include "mul_mv_f16_f32_1row.cl.h"
728
+ };
729
+ #else
730
+ const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
731
+ #endif
732
+ backend_ctx->program_mul_mv_f16_f32_1row =
733
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
734
+
735
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
736
+ GGML_LOG_CONT(".");
737
+ }
738
+
739
+ // mul_mv_f16_f32_l4
740
+ {
741
+ #ifdef GGML_OPENCL_EMBED_KERNELS
742
+ const std::string kernel_src {
743
+ #include "mul_mv_f16_f32_l4.cl.h"
744
+ };
745
+ #else
746
+ const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
747
+ #endif
748
+ backend_ctx->program_mul_mv_f16_f32_l4 =
749
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
750
+
751
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
752
+ GGML_LOG_CONT(".");
753
+ }
754
+
755
+ // mul_mv_f16_f32
756
+ {
757
+ #ifdef GGML_OPENCL_EMBED_KERNELS
758
+ const std::string kernel_src {
759
+ #include "mul_mv_f16_f32.cl.h"
760
+ };
761
+ #else
762
+ const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
763
+ #endif
764
+ backend_ctx->program_mul_mv_f16_f32 =
765
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
766
+
767
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
768
+ GGML_LOG_CONT(".");
769
+ }
770
+
771
+ // mul_mv_f32_f32
772
+ {
773
+ #ifdef GGML_OPENCL_EMBED_KERNELS
774
+ const std::string kernel_src {
775
+ #include "mul_mv_f32_f32.cl.h"
776
+ };
777
+ #else
778
+ const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
779
+ #endif
780
+ backend_ctx->program_mul_mv_f32_f32 =
781
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
782
+
783
+ CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
784
+ GGML_LOG_CONT(".");
785
+ }
786
+
787
+ // mul
788
+ {
789
+ #ifdef GGML_OPENCL_EMBED_KERNELS
790
+ const std::string kernel_src {
791
+ #include "mul.cl.h"
792
+ };
793
+ #else
794
+ const std::string kernel_src = read_file("mul.cl");
795
+ #endif
796
+ backend_ctx->program_mul =
797
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
798
+
799
+ CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
800
+ CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
801
+ GGML_LOG_CONT(".");
802
+ }
803
+
804
+ // norm
805
+ {
806
+ #ifdef GGML_OPENCL_EMBED_KERNELS
807
+ const std::string kernel_src {
808
+ #include "norm.cl.h"
809
+ };
810
+ #else
811
+ const std::string kernel_src = read_file("norm.cl");
812
+ #endif
813
+ backend_ctx->program_norm =
814
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
815
+
816
+ CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program_norm, "kernel_norm", &err), err));
817
+ GGML_LOG_CONT(".");
818
+ }
819
+
820
+ // relu
821
+ {
822
+ #ifdef GGML_OPENCL_EMBED_KERNELS
823
+ const std::string kernel_src {
824
+ #include "relu.cl.h"
825
+ };
826
+ #else
827
+ const std::string kernel_src = read_file("relu.cl");
828
+ #endif
829
+ backend_ctx->program_relu =
830
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
831
+
832
+ CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program_relu, "kernel_relu", &err), err));
833
+ GGML_LOG_CONT(".");
834
+ }
835
+
836
+ // rms_norm
837
+ {
838
+ #ifdef GGML_OPENCL_EMBED_KERNELS
839
+ const std::string kernel_src {
840
+ #include "rms_norm.cl.h"
841
+ };
842
+ #else
843
+ const std::string kernel_src = read_file("rms_norm.cl");
844
+ #endif
845
+ backend_ctx->program_rms_norm =
846
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
847
+
848
+ CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
849
+ GGML_LOG_CONT(".");
850
+ }
851
+
852
+ // rope
853
+ {
854
+ #ifdef GGML_OPENCL_EMBED_KERNELS
855
+ const std::string kernel_src {
856
+ #include "rope.cl.h"
857
+ };
858
+ #else
859
+ const std::string kernel_src = read_file("rope.cl");
860
+ #endif
861
+ backend_ctx->program_rope =
862
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
863
+
864
+ CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f32", &err), err));
865
+ CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f16", &err), err));
866
+ CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f32", &err), err));
867
+ CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f16", &err), err));
868
+ CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f32", &err), err));
869
+ CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f16", &err), err));
870
+ CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f32", &err), err));
871
+ CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f16", &err), err));
872
+ GGML_LOG_CONT(".");
873
+ }
874
+
875
+ // scale
876
+ {
877
+ #ifdef GGML_OPENCL_EMBED_KERNELS
878
+ const std::string kernel_src {
879
+ #include "scale.cl.h"
880
+ };
881
+ #else
882
+ const std::string kernel_src = read_file("scale.cl");
883
+ #endif
884
+ backend_ctx->program_scale =
885
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
886
+
887
+ CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
888
+ GGML_LOG_CONT(".");
889
+ }
890
+
891
+ // silu
892
+ {
893
+ #ifdef GGML_OPENCL_EMBED_KERNELS
894
+ const std::string kernel_src {
895
+ #include "silu.cl.h"
896
+ };
897
+ #else
898
+ const std::string kernel_src = read_file("silu.cl");
899
+ #endif
900
+ backend_ctx->program_silu =
901
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
902
+
903
+ CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program_silu, "kernel_silu", &err), err));
904
+ CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program_silu, "kernel_silu_4", &err), err));
905
+ GGML_LOG_CONT(".");
906
+ }
907
+
908
+ // softmax_f32
909
+ {
910
+ #ifdef GGML_OPENCL_EMBED_KERNELS
911
+ const std::string kernel_src {
912
+ #include "softmax_f32.cl.h"
913
+ };
914
+ #else
915
+ const std::string kernel_src = read_file("softmax_f32.cl");
916
+ #endif
917
+ backend_ctx->program_softmax_f32 =
918
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
919
+
920
+ CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
921
+ GGML_LOG_CONT(".");
922
+ }
923
+
924
+ // softmax_f16
925
+ {
926
+ #ifdef GGML_OPENCL_EMBED_KERNELS
927
+ const std::string kernel_src {
928
+ #include "softmax_f16.cl.h"
929
+ };
930
+ #else
931
+ const std::string kernel_src = read_file("softmax_f16.cl");
932
+ #endif
933
+ backend_ctx->program_softmax_f16 =
934
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
935
+
936
+ CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
937
+ GGML_LOG_CONT(".");
938
+ }
939
+
940
+ // softmax_4_f32
941
+ {
942
+ #ifdef GGML_OPENCL_EMBED_KERNELS
943
+ const std::string kernel_src {
944
+ #include "softmax_4_f32.cl.h"
945
+ };
946
+ #else
947
+ const std::string kernel_src = read_file("softmax_4_f32.cl");
948
+ #endif
949
+ backend_ctx->program_softmax_4_f32 =
950
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
951
+
952
+ CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
953
+ GGML_LOG_CONT(".");
954
+ }
955
+
956
+ // softmax_4_f16
957
+ {
958
+ #ifdef GGML_OPENCL_EMBED_KERNELS
959
+ const std::string kernel_src {
960
+ #include "softmax_4_f16.cl.h"
961
+ };
962
+ #else
963
+ const std::string kernel_src = read_file("softmax_4_f16.cl");
964
+ #endif
965
+ backend_ctx->program_softmax_4_f16 =
966
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
967
+
968
+ CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
969
+ GGML_LOG_CONT(".");
970
+ }
971
+
972
+ // Adreno kernels
973
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
974
+ // transpose
975
+ {
976
+ #ifdef GGML_OPENCL_EMBED_KERNELS
977
+ const std::string kernel_src {
978
+ #include "transpose.cl.h"
979
+ };
980
+ #else
981
+ const std::string kernel_src = read_file("transpose.cl");
982
+ #endif
983
+ backend_ctx->program_transpose =
984
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
985
+
986
+ CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
987
+ CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
988
+ CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
989
+ GGML_LOG_CONT(".");
990
+ }
991
+
992
+ // gemv_noshuffle_general
993
+ {
994
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
995
+ " -cl-mad-enable "
996
+ " -DSIMDGROUP_WIDTH=" +
997
+ std::to_string(backend_ctx->adreno_wave_size);
998
+ if (backend_ctx->has_vector_subgroup_broadcast) {
999
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1000
+ }
1001
+
1002
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1003
+ const std::string kernel_src_CL_gemv_general {
1004
+ #include "gemv_noshuffle_general.cl.h"
1005
+ };
1006
+ #else
1007
+ const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
1008
+ #endif
1009
+
1010
+ backend_ctx->program_CL_gemv_general = build_program_from_source(
1011
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
1012
+
1013
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
1014
+ GGML_LOG_CONT(".");
1015
+ }
1016
+
1017
+ // gemv_noshuffle
1018
+ {
1019
+ // Gemv 2048, 16384
1020
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1021
+ " -cl-mad-enable "
1022
+ " -DLINE_STRIDE_A=2048 "
1023
+ " -DBLOCK_STRIDE_A=16384 "
1024
+ " -DSIMDGROUP_WIDTH=" +
1025
+ std::to_string(backend_ctx->adreno_wave_size);
1026
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1027
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1028
+ }
1029
+
1030
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1031
+ const std::string kernel_src_CL_gemv {
1032
+ #include "gemv_noshuffle.cl.h"
1033
+ };
1034
+ #else
1035
+ const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
1036
+ #endif
1037
+
1038
+ backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
1039
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1040
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
1041
+ GGML_LOG_CONT(".");
1042
+
1043
+ // Gemv 2048, 16384
1044
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1045
+ " -cl-mad-enable "
1046
+ " -DLINE_STRIDE_A=2048 "
1047
+ " -DBLOCK_STRIDE_A=16384 "
1048
+ " -DSIMDGROUP_WIDTH=" +
1049
+ std::to_string(backend_ctx->adreno_wave_size);
1050
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1051
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1052
+ }
1053
+
1054
+ backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
1055
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1056
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
1057
+ GGML_LOG_CONT(".");
1058
+
1059
+ // Gemv 5504, 44032
1060
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1061
+ " -cl-mad-enable "
1062
+ " -DLINE_STRIDE_A=5504 "
1063
+ " -DBLOCK_STRIDE_A=44032 "
1064
+ " -DSIMDGROUP_WIDTH=" +
1065
+ std::to_string(backend_ctx->adreno_wave_size);
1066
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1067
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1068
+ }
1069
+
1070
+ backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
1071
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1072
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
1073
+ GGML_LOG_CONT(".");
1074
+
1075
+ // Gemv 16000, 128000
1076
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1077
+ " -cl-mad-enable "
1078
+ " -DLINE_STRIDE_A=16000 "
1079
+ " -DBLOCK_STRIDE_A=128000 "
1080
+ " -DSIMDGROUP_WIDTH=" +
1081
+ std::to_string(backend_ctx->adreno_wave_size);
1082
+
1083
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1084
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1085
+ }
1086
+
1087
+ backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
1088
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1089
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
1090
+ GGML_LOG_CONT(".");
1091
+ }
1092
+
1093
+ // mul_mat_Ab_Bi_8x4
1094
+ {
1095
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1096
+ const std::string kernel_src_CL_gemm {
1097
+ #include "mul_mat_Ab_Bi_8x4.cl.h"
1098
+ };
1099
+ #else
1100
+ const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
1101
+ #endif
1102
+ backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
1103
+ CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
1104
+ GGML_LOG_CONT(".");
1105
+ }
1106
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1107
+ GGML_LOG_CONT("\n");
1108
+ }
1109
+
1110
  static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1111
  static bool initialized = false;
1112
  static ggml_backend_opencl_context *backend_ctx = nullptr;
 
1345
  GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
1346
  backend_ctx->driver_version = driver_version;
1347
 
1348
+ backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
1349
+ backend_ctx->has_vector_subgroup_broadcast =
1350
+ backend_ctx->adreno_cl_compiler_version.major >= 47 ||
1351
+ backend_ctx->adreno_cl_compiler_version.major == 17;
1352
  GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
1353
+ backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
1354
 
1355
  size_t ext_str_size;
1356
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
 
1425
  #endif
1426
  CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
1427
 
1428
+ // Load kernels
1429
+ load_cl_kernels(backend_ctx, opencl_c_version);
1430
 
1431
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1432
  // Allocate intermediate buffers and images
1433
  size_t required_A_q_d_bytes = 311164928;
1434
  size_t required_A_s_d_bytes = 38895616;
 
1992
 
1993
  // The optimized gemm and gemv kernels are used for large matrices without batch.
1994
  // tensor is the quantized weights matrix.
1995
+ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
1996
+ int64_t threshold_ne0 = 512;
1997
+ int64_t threshold_ne1 = 512;
1998
+ if (!backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) &&
1999
+ backend_ctx->adreno_cl_compiler_version.type != DX) {
2000
+ threshold_ne0 = 128;
2001
+ threshold_ne1 = 128;
2002
+ }
2003
+ return tensor->ne[0] >= threshold_ne0 && tensor->ne[1] >= threshold_ne1 &&
2004
  tensor->ne[2] == 1 && tensor->ne[3] == 1;
2005
  }
2006
 
 
2078
  cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
2079
 
2080
  // The optimized kernels need weights in natural order, so unshuffle.
2081
+ if (use_adreno_kernels(backend_ctx, tensor)) {
2082
  kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
2083
  }
2084
  #else
 
2102
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2103
  // Only do transpose for large, non batched matrix
2104
  // TODO: use preallocated images instead of sub-buffer then image
2105
+ if (use_adreno_kernels(backend_ctx, tensor)) {
2106
  // <----------------------------------------------------------------------------------> //
2107
  // start transpose
2108
  // <----------------------------------------------------------------------------------> //
 
3403
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3404
  cl_command_queue queue = backend_ctx->queue;
3405
 
3406
+ //ggml_backend_opencl_device_context * dev_ctx =
3407
+ // (ggml_backend_opencl_device_context *)backend->device->context;
3408
 
3409
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3410
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
3435
 
3436
  // Note, this kernel declares local memory in kernel args and the size
3437
  // depends on subgroup size.
 
3438
  // Note, this requires OpenCL 2.1 and above
3439
+ // For now we use a fixed subgroup size to simplify support for OpenCL 2.0.
3440
  size_t sgs;
3441
+ //CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
3442
+ // CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3443
+ // sizeof(local_work_size), local_work_size,
3444
+ // sizeof(size_t), &sgs, NULL));
3445
+ if (backend_ctx->gpu_family == ADRENO) {
3446
+ sgs = 64;
3447
+ } else if (backend_ctx->gpu_family == INTEL) {
3448
+ sgs = 32;
3449
+ } else {
3450
+ GGML_ASSERT(false && "Unsupported GPU");
3451
+ }
3452
 
3453
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3454
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 
3541
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3542
  cl_context context = backend_ctx->context;
3543
 
3544
+ if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
3545
 
3546
  // init CL objects
3547
  // <--------------------------------------------> //
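Each block in the new load_cl_kernels helper above follows the same sequence: take the kernel source (embedded .cl.h string or read_file), build a program, then create the kernel object. As a reference for readers less familiar with the OpenCL host API, below is a minimal standalone sketch of that sequence; load_kernel_source and build_and_create are illustrative names with simplified error handling, not code from this patch.

// Hedged sketch of the build-then-create pattern, written against the plain OpenCL C API.
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

static std::string load_kernel_source(const std::string & path) {
    std::ifstream f(path);          // with embedded kernels, the generated .cl.h string
    std::ostringstream ss;          // would be used here instead of a file read
    ss << f.rdbuf();
    return ss.str();
}

static cl_kernel build_and_create(cl_context ctx, cl_device_id dev,
                                  const std::string & src, const char * opts,
                                  const char * kernel_name) {
    cl_int err = CL_SUCCESS;
    const char * p = src.c_str();
    const size_t n = src.size();
    cl_program prog = clCreateProgramWithSource(ctx, 1, &p, &n, &err);
    if (err != CL_SUCCESS) throw std::runtime_error("clCreateProgramWithSource failed");
    if (clBuildProgram(prog, 1, &dev, opts, nullptr, nullptr) != CL_SUCCESS) {
        // dump the build log on failure so kernel compile errors are visible
        size_t log_size = 0;
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
        std::vector<char> log(log_size + 1, 0);
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
        throw std::runtime_error(std::string("OpenCL build failed:\n") + log.data());
    }
    cl_kernel k = clCreateKernel(prog, kernel_name, &err);
    if (err != CL_SUCCESS) throw std::runtime_error("clCreateKernel failed");
    return k;
}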
ggml/src/ggml-opencl/kernels/add.cl ADDED
@@ -0,0 +1,83 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // add
5
+ //------------------------------------------------------------------------------
6
+
7
+ // general-purpose kernel for addition of two tensors
8
+ // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
9
+ // cons: not very efficient
10
+ kernel void kernel_add(
11
+ global char * src0,
12
+ ulong offset0,
13
+ global char * src1,
14
+ ulong offset1,
15
+ global char * dst,
16
+ ulong offsetd,
17
+ int ne00,
18
+ int ne01,
19
+ int ne02,
20
+ int ne03,
21
+ ulong nb00,
22
+ ulong nb01,
23
+ ulong nb02,
24
+ ulong nb03,
25
+ int ne10,
26
+ int ne11,
27
+ int ne12,
28
+ int ne13,
29
+ ulong nb10,
30
+ ulong nb11,
31
+ ulong nb12,
32
+ ulong nb13,
33
+ int ne0,
34
+ int ne1,
35
+ int ne2,
36
+ int ne3,
37
+ ulong nb0,
38
+ ulong nb1,
39
+ ulong nb2,
40
+ ulong nb3
41
+ ) {
42
+ src0 = src0 + offset0;
43
+ src1 = src1 + offset1;
44
+ dst = dst + offsetd;
45
+
46
+ int i03 = get_group_id(2);
47
+ int i02 = get_group_id(1);
48
+ int i01 = get_group_id(0);
49
+
50
+ int i13 = i03 % ne13;
51
+ int i12 = i02 % ne12;
52
+ int i11 = i01 % ne11;
53
+
54
+ global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
55
+ global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
56
+ global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
57
+
58
+ for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
59
+ const int i10 = i0 % ne10;
60
+ *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) + *((global float *)(src1_ptr + i10*nb10));
61
+ }
62
+ }
63
+
64
+ // assumption: src1 is a row
65
+ // broadcast src1 into src0
66
+ kernel void kernel_add_row(
67
+ global float4 * src0,
68
+ ulong offset0,
69
+ global float4 * src1,
70
+ ulong offset1,
71
+ global float4 * dst,
72
+ ulong offsetd,
73
+ int ne
74
+ ) {
75
+ src0 = (global float4*)((global char*)src0 + offset0);
76
+ src1 = (global float4*)((global char*)src1 + offset1);
77
+ dst = (global float4*)((global char*)dst + offsetd);
78
+
79
+ // This performs better than using %.
80
+ uint gid = get_global_id(0);
81
+ uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
82
+ dst[gid] = src0[gid] + src1[idx1];
83
+ }
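The kernel_add_row comment above notes that gid - (gid/ne)*ne performs better than the % operator on the target GPUs. A tiny CPU check, using an arbitrary example row length, confirming that the two forms are equivalent:

// Hedged sketch: equivalence check only; the performance claim comes from the kernel comment.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t ne = 4096 / 4;              // row length in float4 elements (example value)
    for (uint32_t gid = 0; gid < 8 * ne; ++gid) {
        uint32_t idx1 = gid - (gid / ne) * ne; // form used in kernel_add_row
        assert(idx1 == gid % ne);              // identical to the modulo it avoids
    }
    return 0;
}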
ggml/src/ggml-opencl/kernels/clamp.cl ADDED
@@ -0,0 +1,20 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // clamp
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_clamp(
7
+ global float * src0,
8
+ ulong offset0,
9
+ global float * dst,
10
+ ulong offsetd,
11
+ float min,
12
+ float max
13
+ ) {
14
+ src0 = (global float*)((global char*)src0 + offset0);
15
+ dst = (global float*)((global char*)dst + offsetd);
16
+
17
+ dst[get_global_id(0)] = src0[get_global_id(0)] < min ?
18
+ min :
19
+ (src0[get_global_id(0)] > max ? max : src0[get_global_id(0)]);
20
+ }
ggml/src/ggml-opencl/kernels/cpy.cl ADDED
@@ -0,0 +1,184 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // cpy
5
+ //------------------------------------------------------------------------------
6
+
7
+ kernel void kernel_cpy_f16_f16(
8
+ global half * src0,
9
+ ulong offset0,
10
+ global half * dst,
11
+ ulong offsetd,
12
+ int ne00,
13
+ int ne01,
14
+ int ne02,
15
+ int ne03,
16
+ ulong nb00,
17
+ ulong nb01,
18
+ ulong nb02,
19
+ ulong nb03,
20
+ int ne0,
21
+ int ne1,
22
+ int ne2,
23
+ int ne3,
24
+ ulong nb0,
25
+ ulong nb1,
26
+ ulong nb2,
27
+ ulong nb3
28
+ ) {
29
+ src0 = (global half*)((global char*)src0 + offset0);
30
+ dst = (global half*)((global char*)dst + offsetd);
31
+
32
+ int i03 = get_group_id(2);
33
+ int i02 = get_group_id(1);
34
+ int i01 = get_group_id(0);
35
+
36
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
37
+
38
+ int i3 = n / (ne2*ne1*ne0);
39
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
40
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
41
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
42
+
43
+ global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
44
+
45
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
46
+ global const half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
47
+ dst_data[i00] = src[0];
48
+ }
49
+ }
50
+
51
+ kernel void kernel_cpy_f16_f32(
52
+ global half * src0,
53
+ ulong offset0,
54
+ global float * dst,
55
+ ulong offsetd,
56
+ int ne00,
57
+ int ne01,
58
+ int ne02,
59
+ int ne03,
60
+ ulong nb00,
61
+ ulong nb01,
62
+ ulong nb02,
63
+ ulong nb03,
64
+ int ne0,
65
+ int ne1,
66
+ int ne2,
67
+ int ne3,
68
+ ulong nb0,
69
+ ulong nb1,
70
+ ulong nb2,
71
+ ulong nb3
72
+ ) {
73
+
74
+ src0 = (global half*)((global char*)src0 + offset0);
75
+ dst = (global float*)((global char*)dst + offsetd);
76
+
77
+ int i03 = get_group_id(2);
78
+ int i02 = get_group_id(1);
79
+ int i01 = get_group_id(0);
80
+
81
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
82
+
83
+ int i3 = n / (ne2*ne1*ne0);
84
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
85
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
86
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
87
+
88
+ global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
89
+
90
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
91
+ global half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
92
+ dst_data[i00] = src[0];
93
+ }
94
+ }
95
+
96
+ kernel void kernel_cpy_f32_f16(
97
+ global float * src0,
98
+ ulong offset0,
99
+ global half * dst,
100
+ ulong offsetd,
101
+ int ne00,
102
+ int ne01,
103
+ int ne02,
104
+ int ne03,
105
+ ulong nb00,
106
+ ulong nb01,
107
+ ulong nb02,
108
+ ulong nb03,
109
+ int ne0,
110
+ int ne1,
111
+ int ne2,
112
+ int ne3,
113
+ ulong nb0,
114
+ ulong nb1,
115
+ ulong nb2,
116
+ ulong nb3
117
+ ) {
118
+ src0 = (global float*)((global char*)src0 + offset0);
119
+ dst = (global half*)((global char*)dst + offsetd);
120
+
121
+ int i03 = get_group_id(2);
122
+ int i02 = get_group_id(1);
123
+ int i01 = get_group_id(0);
124
+
125
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
126
+
127
+ int i3 = n / (ne2*ne1*ne0);
128
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
129
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
130
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
131
+
132
+ global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
133
+
134
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
135
+ global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
136
+
137
+ dst_data[i00] = src[0];
138
+ }
139
+ }
140
+
141
+ kernel void kernel_cpy_f32_f32(
142
+ global float * src0,
143
+ ulong offset0,
144
+ global float * dst,
145
+ ulong offsetd,
146
+ int ne00,
147
+ int ne01,
148
+ int ne02,
149
+ int ne03,
150
+ ulong nb00,
151
+ ulong nb01,
152
+ ulong nb02,
153
+ ulong nb03,
154
+ int ne0,
155
+ int ne1,
156
+ int ne2,
157
+ int ne3,
158
+ ulong nb0,
159
+ ulong nb1,
160
+ ulong nb2,
161
+ ulong nb3
162
+ ) {
163
+ src0 = (global float*)((global char*)src0 + offset0);
164
+ dst = (global float*)((global char*)dst + offsetd);
165
+
166
+ int i03 = get_group_id(2);
167
+ int i02 = get_group_id(1);
168
+ int i01 = get_group_id(0);
169
+
170
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
171
+
172
+ int i3 = n / (ne2*ne1*ne0);
173
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
174
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
175
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
176
+
177
+ global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
178
+
179
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
180
+ global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
181
+
182
+ dst_data[i00] = src[0];
183
+ }
184
+ }
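Each kernel_cpy_* work-group above handles one source row, flattens its position to a linear element index n, and re-splits n against the destination shape; that re-splitting is what lets the same kernel serve as a reshaping copy. A small CPU sketch of the decomposition, with example shapes:

// Hedged sketch of the index remapping; shapes are example values only.
#include <cstdint>
#include <cstdio>

int main() {
    // source shape ne0x and destination shape neX with the same total element count
    const int64_t ne00 = 8, ne01 = 4, ne02 = 2, ne03 = 1;
    const int64_t ne0 = 16, ne1 = 4, ne2 = 1, ne3 = 1;
    (void)ne03; (void)ne3;

    const int64_t i03 = 0, i02 = 1, i01 = 3;                        // one source row
    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // flattened index

    const int64_t i3 = n / (ne2*ne1*ne0);
    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
    const int64_t i0 = n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0;

    std::printf("source row (%lld,%lld,%lld) -> dst (%lld,%lld,%lld,%lld)\n",
                (long long)i01, (long long)i02, (long long)i03,
                (long long)i0, (long long)i1, (long long)i2, (long long)i3);
    return 0;
}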
ggml/src/ggml-opencl/kernels/cvt.cl ADDED
@@ -0,0 +1,118 @@
1
+ //------------------------------------------------------------------------------
2
+ // This file contains kernels for data conversion.
3
+ // These kernels are used when loading the model, so their performance is less
4
+ // important.
5
+ //------------------------------------------------------------------------------
6
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
7
+
8
+ #ifdef cl_intel_required_subgroup_size
9
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
10
+ #define INTEL_GPU 1
11
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
12
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
13
+ #elif defined(cl_qcom_reqd_sub_group_size)
14
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
15
+ #define ADRENO_GPU 1
16
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
17
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
18
+ #endif
19
+
20
+ #define QK4_0 32
21
+ #define QR4_0 2
22
+ #define QK4_1 32
23
+ #define QR4_1 2
24
+ #define QK5_0 32
25
+ #define QR5_0 2
26
+ #define QK5_1 32
27
+ #define QR5_1 2
28
+ #define QK8_0 32
29
+ #define QR8_0 1
30
+ #define QK_K 256
31
+ #define K_QUANTS_PER_ITERATION 2
32
+
33
+ typedef char int8_t;
34
+ typedef uchar uint8_t;
35
+ typedef short int16_t;
36
+ typedef ushort uint16_t;
37
+ typedef int int32_t;
38
+ typedef uint uint32_t;
39
+
40
+ //------------------------------------------------------------------------------
41
+ // block_q4_0
42
+ //------------------------------------------------------------------------------
43
+ struct block_q4_0
44
+ {
45
+ half d;
46
+ uint8_t qs[QK4_0 / 2];
47
+ };
48
+
49
+ //------------------------------------------------------------------------------
50
+ // kernel_convert_block_q4_0
51
+ // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
52
+ // This kernel does not deshuffle the bits.
53
+ //------------------------------------------------------------------------------
54
+ kernel void kernel_convert_block_q4_0(
55
+ global struct block_q4_0 * src0,
56
+ global uchar * dst_q,
57
+ global half * dst_d
58
+ ) {
59
+ global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
60
+ global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
61
+ global half * d = (global half *) dst_d + get_global_id(0);
62
+
63
+ *d = b->d;
64
+
65
+ for (int i = 0; i < QK4_0/2; ++i) {
66
+ q[i] = b->qs[i];
67
+ }
68
+ }
69
+
70
+ kernel void kernel_restore_block_q4_0(
71
+ global uchar * src_q,
72
+ global half * src_d,
73
+ global struct block_q4_0 * dst
74
+ ) {
75
+ global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
76
+ global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
77
+ global half * d = (global half *) src_d + get_global_id(0);
78
+
79
+ b->d = *d;
80
+ for (int i = 0; i < QK4_0/2; ++i) {
81
+ b->qs[i] = q[i];
82
+ }
83
+ }
84
+
85
+ //------------------------------------------------------------------------------
86
+ // kernel_convert_block_q4_0_noshuffle
87
+ // Flatten q4_0 weights and unshuffle the bits
88
+ //------------------------------------------------------------------------------
89
+
90
+ kernel void kernel_convert_block_q4_0_noshuffle(
91
+ global struct block_q4_0 * src0,
92
+ global uchar * dst_q,
93
+ global half * dst_d
94
+ ) {
95
+ global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
96
+ global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
97
+ global half * d = (global half *) dst_d + get_global_id(0);
98
+
99
+ *d = b->d;
100
+ for (int i = 0; i < QK4_0/4; ++i) {
101
+ uchar x0 = b->qs[2*i + 0];
102
+ uchar x1 = b->qs[2*i + 1];
103
+
104
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
105
+ q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
106
+
107
+ #ifdef ADRENO_GPU
108
+ // Workaround for adreno - must have the following printf statement for
109
+ // the kernel to work properly. Otherwise it produces incorrect result.
110
+ // convert_uchar above also seems necessary.
111
+ // Compare against a large number so that it does not print anything.
112
+ // get_sub_group_local_id() also works.
113
+ if (get_global_id(0) == 65536*4096) {
114
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
115
+ }
116
+ #endif
117
+ }
118
+ }
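kernel_convert_block_q4_0_noshuffle above does two things per block: it splits the block into separate quant and scale arrays (AOS -> SOA), and it gathers the low nibbles into the first QK4_0/4 output bytes and the high nibbles into the next QK4_0/4. A CPU reference of that per-block repacking, with the half scale represented as uint16_t purely for illustration:

// Hedged sketch: CPU reference of the repacking, not the code used by the backend.
#include <cstdint>

constexpr int QK4_0_REF = 32;

struct block_q4_0_ref {                 // mirrors the struct in cvt.cl
    uint16_t d;                         // half, stored as raw 16-bit value here
    uint8_t  qs[QK4_0_REF / 2];
};

void unshuffle_block(const block_q4_0_ref & b, uint8_t * q, uint16_t * d) {
    *d = b.d;                           // scale goes to its own array (AOS -> SOA)
    for (int i = 0; i < QK4_0_REF / 4; ++i) {
        const uint8_t x0 = b.qs[2*i + 0];
        const uint8_t x1 = b.qs[2*i + 1];
        q[i]                = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));  // low nibbles
        q[i + QK4_0_REF/4]  = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0)); // high nibbles
    }
}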
ggml/src/ggml-opencl/kernels/diag_mask_inf.cl ADDED
@@ -0,0 +1,58 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // diag_mask_inf kernels
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_diag_mask_inf(
7
+ global float * src0,
8
+ ulong offset0,
9
+ global float * dst,
10
+ ulong offsetd,
11
+ int ne00,
12
+ int ne01,
13
+ int n_past
14
+ ) {
15
+ src0 = (global float*)((global char*)src0 + offset0);
16
+ dst = (global float*)((global char*)dst + offsetd);
17
+
18
+ int i02 = get_global_id(2);
19
+ int i01 = get_global_id(1);
20
+ int i00 = get_global_id(0);
21
+
22
+ if (i00 > n_past + i01) {
23
+ dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
24
+ } else {
25
+ dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
26
+ }
27
+ }
28
+
29
+ kernel void kernel_diag_mask_inf_8(
30
+ global float4 * src0,
31
+ ulong offset0,
32
+ global float4 * dst,
33
+ ulong offsetd,
34
+ int ne00,
35
+ int ne01,
36
+ int n_past
37
+ ) {
38
+ src0 = (global float4*)((global char*)src0 + offset0);
39
+ dst = (global float4*)((global char*)dst + offsetd);
40
+
41
+ int i = 2*get_global_id(0);
42
+
43
+ dst[i+0] = src0[i+0];
44
+ dst[i+1] = src0[i+1];
45
+ int i4 = 4*i;
46
+ int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
47
+ int i01 = i4/(ne00); i4 -= i01*ne00;
48
+ int i00 = i4;
49
+ for (int k = 3; k >= 0; --k) {
50
+ if (i00 + 4 + k <= n_past + i01) {
51
+ break;
52
+ }
53
+ (&dst[i+1])[k] = -INFINITY;
54
+ if (i00 + k > n_past + i01) {
55
+ (&dst[i])[k] = -INFINITY;
56
+ }
57
+ }
58
+ }
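Both kernels above apply the same scalar rule: an element is set to -INF once its column index exceeds n_past plus its row index, i.e. once it lies past the causal diagonal shifted by the number of cached tokens. A plain C++ reference of that rule:

// Hedged sketch: scalar reference for the masking rule, ignoring the vectorized layout.
#include <cmath>
#include <vector>

void diag_mask_inf_ref(std::vector<float> & x, int ne00, int ne01, int n_past) {
    for (int i01 = 0; i01 < ne01; ++i01) {
        for (int i00 = 0; i00 < ne00; ++i00) {
            if (i00 > n_past + i01) {
                x[i01*ne00 + i00] = -INFINITY;  // future position: mask it out
            }
        }
    }
}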
ggml/src/ggml-opencl/kernels/gelu.cl ADDED
@@ -0,0 +1,62 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // gelu
5
+ //------------------------------------------------------------------------------
6
+ #define GELU_COEF_A 0.044715f
7
+ #define GELU_QUICK_COEF -1.702f
8
+ #define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
9
+
10
+ kernel void kernel_gelu(
11
+ global float * src0,
12
+ ulong offset0,
13
+ global float * dst,
14
+ ulong offsetd
15
+ ) {
16
+ src0 = (global float*)((global char*)src0 + offset0);
17
+ dst = (global float*)((global char*)dst + offsetd);
18
+
19
+ float x = src0[get_global_id(0)];
20
+
21
+ dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
22
+ }
23
+
24
+ kernel void kernel_gelu_4(
25
+ global float4 * src0,
26
+ ulong offset0,
27
+ global float4 * dst,
28
+ ulong offsetd
29
+ ) {
30
+ src0 = (global float4*)((global char*)src0 + offset0);
31
+ dst = (global float4*)((global char*)dst + offsetd);
32
+
33
+ float4 x = src0[get_global_id(0)];
34
+
35
+ dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
36
+ }
37
+
38
+ kernel void kernel_gelu_quick(
39
+ global float * src0,
40
+ ulong offset0,
41
+ global float * dst,
42
+ ulong offsetd
43
+ ) {
44
+ src0 = (global float*)((global char*)src0 + offset0);
45
+ dst = (global float*)((global char*)dst + offsetd);
46
+
47
+ float x = src0[get_global_id(0)];
48
+ dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
49
+ }
50
+
51
+ kernel void kernel_gelu_quick_4(
52
+ global float4 * src0,
53
+ ulong offset0,
54
+ global float4 * dst,
55
+ ulong offsetd
56
+ ) {
57
+ src0 = (global float4*)((global char*)src0 + offset0);
58
+ dst = (global float4*)((global char*)dst + offsetd);
59
+
60
+ float4 x = src0[get_global_id(0)];
61
+ dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
62
+ }
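For comparison, the same two approximations in scalar C++ using the constants defined above; the exact reference would be 0.5*x*(1 + erf(x/sqrt(2))).

// Hedged sketch: scalar references matching the kernels, for host-side comparison only.
#include <cmath>

constexpr float GELU_COEF_A_REF     = 0.044715f;
constexpr float GELU_QUICK_COEF_REF = -1.702f;
constexpr float SQRT_2_OVER_PI_REF  = 0.79788456080286535587989211986876f;

float gelu_tanh(float x) {   // matches kernel_gelu / kernel_gelu_4
    return 0.5f*x*(1.0f + std::tanh(SQRT_2_OVER_PI_REF*x*(1.0f + GELU_COEF_A_REF*x*x)));
}

float gelu_quick(float x) {  // matches kernel_gelu_quick: x * sigmoid(1.702*x)
    return x*(1.0f/(1.0f + std::exp(GELU_QUICK_COEF_REF*x)));
}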
ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl ADDED
@@ -0,0 +1,268 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
3
+
4
+ #ifdef cl_qcom_reqd_sub_group_size
5
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6
+ #define ADRENO_GPU 1
7
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8
+ #endif
9
+
10
+ // assume
11
+ #define QK4_0 32
12
+ #define N_SIMDGROUP 4
13
+
14
+ #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
15
+ float shared_y; \
16
+ shared_y = sub_group_broadcast(y.s0, 0); \
17
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
18
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
19
+ shared_y = sub_group_broadcast(y.s1, 0); \
20
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
21
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
22
+ shared_y = sub_group_broadcast(y.s2, 0); \
23
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
24
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
25
+ shared_y = sub_group_broadcast(y.s3, 0); \
26
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
27
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
28
+ shared_y = sub_group_broadcast(y.s4, 0); \
29
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
30
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
31
+ shared_y = sub_group_broadcast(y.s5, 0); \
32
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
33
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
34
+ shared_y = sub_group_broadcast(y.s6, 0); \
35
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
36
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
37
+ shared_y = sub_group_broadcast(y.s7, 0); \
38
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
39
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
40
+ shared_y = sub_group_broadcast(y.s0, 1); \
41
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
42
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
43
+ shared_y = sub_group_broadcast(y.s1, 1); \
44
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
45
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
46
+ shared_y = sub_group_broadcast(y.s2, 1); \
47
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
48
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
49
+ shared_y = sub_group_broadcast(y.s3, 1); \
50
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
51
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
52
+ shared_y = sub_group_broadcast(y.s4, 1); \
53
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
54
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
55
+ shared_y = sub_group_broadcast(y.s5, 1); \
56
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
57
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
58
+ shared_y = sub_group_broadcast(y.s6, 1); \
59
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
60
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
61
+ shared_y = sub_group_broadcast(y.s7, 1); \
62
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
63
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
64
+
65
+
66
+ #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
67
+ shared_y = sub_group_broadcast(y.s0, 2); \
68
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
69
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
70
+ shared_y = sub_group_broadcast(y.s1, 2); \
71
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
72
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
73
+ shared_y = sub_group_broadcast(y.s2, 2); \
74
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
75
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
76
+ shared_y = sub_group_broadcast(y.s3, 2); \
77
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
78
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
79
+ shared_y = sub_group_broadcast(y.s4, 2); \
80
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
81
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
82
+ shared_y = sub_group_broadcast(y.s5, 2); \
83
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
84
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
85
+ shared_y = sub_group_broadcast(y.s6, 2); \
86
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
87
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
88
+ shared_y = sub_group_broadcast(y.s7, 2); \
89
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
90
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
91
+ shared_y = sub_group_broadcast(y.s0, 3); \
92
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
93
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
94
+ shared_y = sub_group_broadcast(y.s1, 3); \
95
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
96
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
97
+ shared_y = sub_group_broadcast(y.s2, 3); \
98
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
99
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
100
+ shared_y = sub_group_broadcast(y.s3, 3); \
101
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
102
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
103
+ shared_y = sub_group_broadcast(y.s4, 3); \
104
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
105
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
106
+ shared_y = sub_group_broadcast(y.s5, 3); \
107
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
108
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
109
+ shared_y = sub_group_broadcast(y.s6, 3); \
110
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
111
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
112
+ shared_y = sub_group_broadcast(y.s7, 3); \
113
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
114
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
115
+
116
+
117
+ #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
118
+ float8 shared_y; \
119
+ shared_y = sub_group_broadcast(y, 0); \
120
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
121
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
122
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
123
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
124
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
125
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
126
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
127
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
128
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
129
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
130
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
131
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
132
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
133
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
134
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
135
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
136
+ shared_y = sub_group_broadcast(y, 1); \
137
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
138
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
139
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
140
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
141
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
142
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
143
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
144
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
145
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
146
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
147
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
148
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
149
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
150
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
151
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
152
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
153
+
154
+
155
+ #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
156
+ shared_y = sub_group_broadcast(y, 2); \
157
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
158
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
159
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
160
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
161
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
162
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
163
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
164
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
165
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
166
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
167
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
168
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
169
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
170
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
171
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
172
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
173
+ shared_y = sub_group_broadcast(y, 3); \
174
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
175
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
176
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
177
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
178
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
179
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
180
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
181
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
182
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
183
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
184
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
185
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
186
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
187
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
188
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
189
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
190
+
191
+ #ifdef ADRENO_GPU
192
+ REQD_SUBGROUP_SIZE_64
193
+ #endif
194
+ __kernel void kernel_gemv_noshuffle(
195
+ __read_only image1d_buffer_t src0_q, // quantized A
196
+ global half2 * src0_d, // A scales
197
+ __read_only image1d_buffer_t src1, // B
198
+ ulong offset1, // offset to B (0)
199
+ global float * dst, // C
200
+ ulong offsetd, // offset to C (0)
201
+ uint K, // K
202
+ int ne01, // M
203
+ int ne02, // 1
204
+ int ne10, // K
205
+ int ne12, // 1
206
+ int ne0, // M
207
+ int ne1, // N
208
+ int r2, // 1
209
+ int r3)
210
+ {
211
+ uint groupId = get_local_id(1);
212
+ uint gid = get_global_id(0);
213
+ ushort slid = get_sub_group_local_id();
214
+
215
+ __private uint4 regA;
216
+ __private half2 regS;
217
+ __private float8 regB;
218
+
219
+ __private float2 totalSum = (float2)(0.0f);
220
+
221
+ // loop along K in block granularity, skip 4 blocks every iter
222
+ for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
223
+ regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
224
+ // first 4 fibers in each wave load 8 B values to its private scope
225
+ if (slid < 4) {
226
+ regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
227
+ regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
228
+ }
229
+
230
+ // load half weights for two blocks in consecutive rows
231
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
232
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
233
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
234
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
235
+ #ifdef VECTOR_SUB_GROUP_BROADCAT
236
+ dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
237
+ #else
238
+ dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
239
+ #endif // VECTOR_SUB_GROUP_BROADCAT
240
+
241
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
242
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
243
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
244
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
245
+ #ifdef VECTOR_SUB_GROUP_BROADCAT
246
+ dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
247
+ #else
248
+ dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
249
+ #endif // VECTOR_SUB_GROUP_BROADCAT
250
+ }
251
+
252
+ // reduction in local memory, assumes #wave=4
253
+ __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
254
+ if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
255
+ if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
256
+ if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
257
+ barrier(CLK_LOCAL_MEM_FENCE);
258
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
259
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
260
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
261
+
262
+ // 2 outputs per fiber in wave 0
263
+ if (groupId == 0) {
264
+ dst = (global float*)((global char*)dst + offsetd);
265
+ vstore2(totalSum, 0, &(dst[gid * 2]));
266
+ }
267
+
268
+ }
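The host builds several copies of this kernel with LINE_STRIDE_A and BLOCK_STRIDE_A fixed at compile time (see the gemv_noshuffle blocks in ggml-opencl.cpp above), one per common weight-matrix height. Assuming the relationship used by the general variant below (LINE_STRIDE_A = M/2, BLOCK_STRIDE_A = N_SIMDGROUP * M), a quick sketch reproduces the specialized values:

// Hedged sketch: derives the baked-in strides from M; the mapping is taken from
// gemv_noshuffle_general.cl, which computes the same values at run time from ne01.
#include <cstdio>

constexpr int N_SIMDGROUP_REF = 4;  // matches the kernel's assumption of 4 waves per work-group

int main() {
    const int heights[] = { 4096, 11008, 32000 };     // M values specialized by the host code
    for (int M : heights) {
        const int line_stride_a  = M / 2;             // half2 scales: one entry per two rows
        const int block_stride_a = N_SIMDGROUP_REF * M; // stride between successive K blocks of A
        std::printf("M=%5d -> LINE_STRIDE_A=%5d BLOCK_STRIDE_A=%6d\n",
                    M, line_stride_a, block_stride_a);
    }
    return 0;
}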
ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl ADDED
@@ -0,0 +1,274 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
3
+
4
+ #ifdef cl_qcom_reqd_sub_group_size
5
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6
+ #define ADRENO_GPU 1
7
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8
+ #endif
9
+
10
+ // assume
11
+ #define QK4_0 32
12
+ #define N_SIMDGROUP 4
13
+
14
+ #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
15
+ float shared_y; \
16
+ shared_y = sub_group_broadcast(y.s0, 0); \
17
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
18
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
19
+ shared_y = sub_group_broadcast(y.s1, 0); \
20
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
21
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
22
+ shared_y = sub_group_broadcast(y.s2, 0); \
23
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
24
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
25
+ shared_y = sub_group_broadcast(y.s3, 0); \
26
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
27
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
28
+ shared_y = sub_group_broadcast(y.s4, 0); \
29
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
30
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
31
+ shared_y = sub_group_broadcast(y.s5, 0); \
32
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
33
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
34
+ shared_y = sub_group_broadcast(y.s6, 0); \
35
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
36
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
37
+ shared_y = sub_group_broadcast(y.s7, 0); \
38
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
39
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
40
+ shared_y = sub_group_broadcast(y.s0, 1); \
41
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
42
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
43
+ shared_y = sub_group_broadcast(y.s1, 1); \
44
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
45
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
46
+ shared_y = sub_group_broadcast(y.s2, 1); \
47
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
48
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
49
+ shared_y = sub_group_broadcast(y.s3, 1); \
50
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
51
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
52
+ shared_y = sub_group_broadcast(y.s4, 1); \
53
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
54
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
55
+ shared_y = sub_group_broadcast(y.s5, 1); \
56
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
57
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
58
+ shared_y = sub_group_broadcast(y.s6, 1); \
59
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
60
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
61
+ shared_y = sub_group_broadcast(y.s7, 1); \
62
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
63
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
64
+
65
+
66
+ #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
67
+ shared_y = sub_group_broadcast(y.s0, 2); \
68
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
69
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
70
+ shared_y = sub_group_broadcast(y.s1, 2); \
71
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
72
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
73
+ shared_y = sub_group_broadcast(y.s2, 2); \
74
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
75
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
76
+ shared_y = sub_group_broadcast(y.s3, 2); \
77
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
78
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
79
+ shared_y = sub_group_broadcast(y.s4, 2); \
80
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
81
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
82
+ shared_y = sub_group_broadcast(y.s5, 2); \
83
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
84
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
85
+ shared_y = sub_group_broadcast(y.s6, 2); \
86
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
87
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
88
+ shared_y = sub_group_broadcast(y.s7, 2); \
89
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
90
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
91
+ shared_y = sub_group_broadcast(y.s0, 3); \
92
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
93
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
94
+ shared_y = sub_group_broadcast(y.s1, 3); \
95
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
96
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
97
+ shared_y = sub_group_broadcast(y.s2, 3); \
98
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
99
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
100
+ shared_y = sub_group_broadcast(y.s3, 3); \
101
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
102
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
103
+ shared_y = sub_group_broadcast(y.s4, 3); \
104
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
105
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
106
+ shared_y = sub_group_broadcast(y.s5, 3); \
107
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
108
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
109
+ shared_y = sub_group_broadcast(y.s6, 3); \
110
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
111
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
112
+ shared_y = sub_group_broadcast(y.s7, 3); \
113
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
114
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
115
+
116
+
117
+ #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
118
+ float8 shared_y; \
119
+ shared_y = sub_group_broadcast(y, 0); \
120
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
121
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
122
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
123
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
124
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
125
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
126
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
127
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
128
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
129
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
130
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
131
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
132
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
133
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
134
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
135
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
136
+ shared_y = sub_group_broadcast(y, 1); \
137
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
138
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
139
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
140
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
141
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
142
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
143
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
144
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
145
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
146
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
147
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
148
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
149
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
150
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
151
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
152
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
153
+
154
+
155
+ #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
156
+ shared_y = sub_group_broadcast(y, 2); \
157
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
158
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
159
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
160
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
161
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
162
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
163
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
164
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
165
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
166
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
167
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
168
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
169
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
170
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
171
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
172
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
173
+ shared_y = sub_group_broadcast(y, 3); \
174
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
175
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
176
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
177
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
178
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
179
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
180
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
181
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
182
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
183
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
184
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
185
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
186
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
187
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
188
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
189
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
190
+
191
+ #ifdef ADRENO_GPU
192
+ REQD_SUBGROUP_SIZE_64
193
+ #endif
194
+ __kernel void kernel_gemv_noshuffle(
195
+ __read_only image1d_buffer_t src0_q, // quantized A
196
+ global half2 * src0_d, // A scales
197
+ __read_only image1d_buffer_t src1, // B
198
+ ulong offset1, // offset to B (0)
199
+ global float * dst, // C
200
+ ulong offsetd, // offset to C (0)
201
+ int ne00, // K
202
+ int ne01, // M
203
+ int ne02, // 1
204
+ int ne10, // K
205
+ int ne12, // 1
206
+ int ne0, // M
207
+ int ne1, // N
208
+ int r2, // 1
209
+ int r3)
210
+ {
211
+ uint groupId = get_local_id(1);
212
+ uint gid = get_global_id(0);
213
+ ushort slid = get_sub_group_local_id();
214
+
215
+ uint K = ne00;
216
+ uint M = ne01;
217
+
218
+ uint LINE_STRIDE_A = M / 2;
219
+ uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
220
+
221
+ __private uint4 regA;
222
+ __private half2 regS;
223
+ __private float8 regB;
224
+
225
+ __private float2 totalSum = (float2)(0.0f);
226
+
227
+ // loop along K in block granularity, skip 4 blocks every iter
228
+ for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
229
+ regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
230
+ // first 4 fibers in each wave load 8 B values into their private scope
231
+ if (slid < 4) {
232
+ regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
233
+ regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
234
+ }
235
+
236
+ // load half weights for two blocks in consecutive rows
237
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
238
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
239
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
240
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
241
+ #ifdef VECTOR_SUB_GROUP_BROADCAT
242
+ dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
243
+ #else
244
+ dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
245
+ #endif // VECTOR_SUB_GROUP_BROADCAT
246
+
247
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
248
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
249
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
250
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
251
+ #ifdef VECTOR_SUB_GROUP_BROADCAT
252
+ dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
253
+ #else
254
+ dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
255
+ #endif // VECTOR_SUB_GROUP_BROADCAT
256
+ }
257
+
258
+ // reduction in local memory, assumes #wave=4
259
+ __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
260
+ if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
261
+ if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
262
+ if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
263
+ barrier(CLK_LOCAL_MEM_FENCE);
264
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
265
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
266
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
267
+
268
+ // 2 outputs per fiber in wave 0
269
+ if (groupId == 0) {
270
+ dst = (global float*)((global char*)dst + offsetd);
271
+ vstore2(totalSum, 0, &(dst[gid * 2]));
272
+ }
273
+
274
+ }
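
For reference, a minimal host-side C++ sketch of the cross-wave reduction above: waves 1-3 park their partial sums in local memory, the barrier makes them visible, and wave 0 folds them in before storing two outputs per fiber. The wave count of 4 matches the kernel's stated assumption; SIMDGROUP_WIDTH = 64 and the scalar partials below are illustrative stand-ins, not values taken from the commit.

#include <cstdio>
#include <vector>

int main() {
    const int SIMDGROUP_WIDTH = 64, N_WAVES = 4;
    // partial[w][lane] stands for the totalSum held by fiber `lane` of wave `w`.
    std::vector<std::vector<float>> partial(N_WAVES, std::vector<float>(SIMDGROUP_WIDTH, 1.0f));

    // Waves 1..3 write into reduceLM; this models the three local-memory slabs.
    std::vector<float> reduceLM(SIMDGROUP_WIDTH * 3);
    for (int w = 1; w < N_WAVES; ++w)
        for (int lane = 0; lane < SIMDGROUP_WIDTH; ++lane)
            reduceLM[SIMDGROUP_WIDTH * (w - 1) + lane] = partial[w][lane];

    // After the barrier, wave 0 accumulates the other three waves' partials.
    for (int lane = 0; lane < SIMDGROUP_WIDTH; ++lane)
        for (int w = 0; w < 3; ++w)
            partial[0][lane] += reduceLM[SIMDGROUP_WIDTH * w + lane];

    printf("lane 0 total = %f\n", partial[0][0]); // 4.0 when every partial is 1
    return 0;
}
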
ggml/src/ggml-opencl/kernels/get_rows.cl ADDED
@@ -0,0 +1,163 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ typedef char int8_t;
4
+ typedef uchar uint8_t;
5
+ typedef short int16_t;
6
+ typedef ushort uint16_t;
7
+ typedef int int32_t;
8
+ typedef uint uint32_t;
9
+
10
+ #define QK4_0 32
11
+
12
+ //------------------------------------------------------------------------------
13
+ // block_q4_0
14
+ //------------------------------------------------------------------------------
15
+ struct block_q4_0
16
+ {
17
+ half d;
18
+ uint8_t qs[QK4_0 / 2];
19
+ };
20
+
21
+
22
+ //------------------------------------------------------------------------------
23
+ // dequantize_q4_0_f32, dequantize_q4_0_f16
24
+ //------------------------------------------------------------------------------
25
+ void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
26
+ global ushort * qs = ((global ushort *)xb + 1);
27
+ float d1 = il ? (xb->d / 16.h) : xb->d;
28
+ float d2 = d1 / 256.f;
29
+ float md = -8.h * xb->d;
30
+ ushort mask0 = il ? 0x00F0 : 0x000F;
31
+ ushort mask1 = mask0 << 8;
32
+
33
+ reg->s0 = d1 * (qs[0] & mask0) + md;
34
+ reg->s1 = d2 * (qs[0] & mask1) + md;
35
+
36
+ reg->s2 = d1 * (qs[1] & mask0) + md;
37
+ reg->s3 = d2 * (qs[1] & mask1) + md;
38
+
39
+ reg->s4 = d1 * (qs[2] & mask0) + md;
40
+ reg->s5 = d2 * (qs[2] & mask1) + md;
41
+
42
+ reg->s6 = d1 * (qs[3] & mask0) + md;
43
+ reg->s7 = d2 * (qs[3] & mask1) + md;
44
+
45
+ reg->s8 = d1 * (qs[4] & mask0) + md;
46
+ reg->s9 = d2 * (qs[4] & mask1) + md;
47
+
48
+ reg->sa = d1 * (qs[5] & mask0) + md;
49
+ reg->sb = d2 * (qs[5] & mask1) + md;
50
+
51
+ reg->sc = d1 * (qs[6] & mask0) + md;
52
+ reg->sd = d2 * (qs[6] & mask1) + md;
53
+
54
+ reg->se = d1 * (qs[7] & mask0) + md;
55
+ reg->sf = d2 * (qs[7] & mask1) + md;
56
+ }
57
+
58
+
59
+ //------------------------------------------------------------------------------
60
+ // get_rows
61
+ //------------------------------------------------------------------------------
62
+ kernel void kernel_get_rows_f32(
63
+ global void * src0,
64
+ ulong offset0,
65
+ global int * src1,
66
+ ulong offset1,
67
+ global float * dst,
68
+ ulong offsetd,
69
+ int ne00,
70
+ ulong nb01,
71
+ ulong nb02,
72
+ int ne10,
73
+ ulong nb10,
74
+ ulong nb11,
75
+ ulong nb1,
76
+ ulong nb2
77
+ ) {
78
+ src0 = (global void*)((global char*)src0 + offset0);
79
+ src1 = (global int*)((global char*)src1 + offset1);
80
+ dst = (global float*)((global char*)dst + offsetd);
81
+
82
+ int i10 = get_group_id(0);
83
+ int i11 = get_group_id(1);
84
+
85
+ int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
86
+
87
+ int i02 = i11;
88
+
89
+ for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
90
+ ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
91
+ ((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
92
+ }
93
+ }
94
+
95
+ kernel void kernel_get_rows_f16(
96
+ global void * src0,
97
+ ulong offset0,
98
+ global int * src1,
99
+ ulong offset1,
100
+ global float * dst,
101
+ ulong offsetd,
102
+ int ne00,
103
+ ulong nb01,
104
+ ulong nb02,
105
+ int ne10,
106
+ ulong nb10,
107
+ ulong nb11,
108
+ ulong nb1,
109
+ ulong nb2
110
+ ) {
111
+ src0 = (global void*)((global char*)src0 + offset0);
112
+ src1 = (global int*)((global char*)src1 + offset1);
113
+ dst = (global float*)((global char*)dst + offsetd);
114
+
115
+ int i10 = get_group_id(0);
116
+ int i11 = get_group_id(1);
117
+
118
+ int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
119
+
120
+ int i02 = i11;
121
+
122
+ for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
123
+ ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
124
+ ((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
125
+ }
126
+ }
127
+
128
+ kernel void kernel_get_rows_q4_0(
129
+ global void * src0,
130
+ ulong offset0,
131
+ global int * src1,
132
+ ulong offset1,
133
+ global float * dst,
134
+ ulong offsetd,
135
+ int ne00,
136
+ ulong nb01,
137
+ ulong nb02,
138
+ int ne10,
139
+ ulong nb10,
140
+ ulong nb11,
141
+ ulong nb1,
142
+ ulong nb2
143
+ ) {
144
+ src0 = (global void*)((global char*)src0 + offset0);
145
+ src1 = (global int*)((global char*)src1 + offset1);
146
+ dst = (global float*)((global char*)dst + offsetd);
147
+
148
+ const int NL = 2;
149
+
150
+ int i10 = get_group_id(0);
151
+ int i11 = get_group_id(1);
152
+
153
+ int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
154
+
155
+ int i02 = i11;
156
+
157
+ for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
158
+ float16 temp;
159
+ dequantize_q4_0_f32(
160
+ ((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
161
+ *(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
162
+ }
163
+ }
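
A small host-side check of the arithmetic behind dequantize_q4_0_f32: reading the quants as ushorts puts every second nibble 8 bits higher, so the kernel folds that shift into a second scale d2 = d1/256 and folds the q4_0 offset into md = -8*d. The sketch below (values made up, standard ggml q4_0 packing assumed, il = 0 case) shows the formulation matches the naive (nibble - 8) * d.

#include <cstdio>
#include <cstdint>

int main() {
    const float    d  = 0.5f;    // block scale (xb->d)
    const uint16_t qs = 0x7A2C;  // two packed bytes: 0x2C (low), 0x7A (high)

    // Naive q4_0: value = (nibble - 8) * d, low nibbles first.
    float ref0 = ((qs & 0x000F) - 8) * d;        // low nibble of byte 0
    float ref1 = (((qs >> 8) & 0x000F) - 8) * d; // low nibble of byte 1

    // Kernel formulation for il = 0: d1 = d, d2 = d/256, md = -8*d.
    float d1 = d, d2 = d / 256.f, md = -8.f * d;
    float k0 = d1 * (qs & 0x000F) + md;  // same element as ref0
    float k1 = d2 * (qs & 0x0F00) + md;  // d/256 cancels the <<8 of byte 1
    printf("%f %f | %f %f\n", ref0, k0, ref1, k1);
    return 0;
}
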
ggml/src/ggml-opencl/kernels/im2col_f16.cl ADDED
@@ -0,0 +1,57 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ kernel void kernel_im2col_f16(
4
+ global float * src1,
5
+ ulong offset1,
6
+ global half * dst,
7
+ ulong offsetd,
8
+ ulong batch_offset,
9
+ ulong delta_offset,
10
+ long IW,
11
+ long IH,
12
+ long IC,
13
+ long OW,
14
+ long OH,
15
+ long KW,
16
+ long KH,
17
+ long pelements,
18
+ long CHW,
19
+ int s0,
20
+ int s1,
21
+ int p0,
22
+ int p1,
23
+ int d0,
24
+ int d1
25
+ ) {
26
+ long i = get_global_id(0);
27
+ if (i >= pelements) {
28
+ return;
29
+ }
30
+
31
+ src1 = (global float*)((global char*)src1 + offset1);
32
+ dst = (global half*)((global char*)dst + offsetd);
33
+
34
+ long ksize = OW * (KH > 1 ? KW : 1);
35
+ long kx = i / ksize;
36
+ long kd = kx * ksize;
37
+ long ky = (i - kd) / OW;
38
+ long ix = i % OW;
39
+
40
+ long oh = get_group_id(1);
41
+ long batch = get_group_id(2) / IC;
42
+ long ic = get_group_id(2) % IC;
43
+
44
+ long iiw = ix * s0 + kx * d0 - p0;
45
+ long iih = oh * s1 + ky * d1 - p1;
46
+
47
+ long offset_dst =
48
+ ((batch * OH + oh) * OW + ix) * CHW +
49
+ (ic * (KW * KH) + ky * KW + kx);
50
+
51
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
52
+ dst[offset_dst] = 0.0f;
53
+ } else {
54
+ long offset_src = ic * delta_offset + batch * batch_offset;
55
+ dst[offset_dst] = src1[offset_src + iih * IW + iiw];
56
+ }
57
+ }
ggml/src/ggml-opencl/kernels/im2col_f32.cl ADDED
@@ -0,0 +1,57 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ kernel void kernel_im2col_f32(
4
+ global float * src1,
5
+ ulong offset1,
6
+ global float * dst,
7
+ ulong offsetd,
8
+ ulong batch_offset,
9
+ ulong delta_offset,
10
+ long IW,
11
+ long IH,
12
+ long IC,
13
+ long OW,
14
+ long OH,
15
+ long KW,
16
+ long KH,
17
+ long pelements,
18
+ long CHW,
19
+ int s0,
20
+ int s1,
21
+ int p0,
22
+ int p1,
23
+ int d0,
24
+ int d1
25
+ ) {
26
+ long i = get_global_id(0);
27
+ if (i >= pelements) {
28
+ return;
29
+ }
30
+
31
+ src1 = (global float*)((global char*)src1 + offset1);
32
+ dst = (global float*)((global char*)dst + offsetd);
33
+
34
+ long ksize = OW * (KH > 1 ? KW : 1);
35
+ long kx = i / ksize;
36
+ long kd = kx * ksize;
37
+ long ky = (i - kd) / OW;
38
+ long ix = i % OW;
39
+
40
+ long oh = get_group_id(1);
41
+ long batch = get_group_id(2) / IC;
42
+ long ic = get_group_id(2) % IC;
43
+
44
+ long iiw = ix * s0 + kx * d0 - p0;
45
+ long iih = oh * s1 + ky * d1 - p1;
46
+
47
+ long offset_dst =
48
+ ((batch * OH + oh) * OW + ix) * CHW +
49
+ (ic * (KW * KH) + ky * KW + kx);
50
+
51
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
52
+ dst[offset_dst] = 0.0f;
53
+ } else {
54
+ long offset_src = ic * delta_offset + batch * batch_offset;
55
+ dst[offset_dst] = src1[offset_src + iih * IW + iiw];
56
+ }
57
+ }
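
Both im2col kernels decode a 1D work-item id into a (kx, ky, ix) triple with the ksize arithmetic above. A tiny C++ sketch of that decode; the sizes are made up and the actual pelements bound is supplied by the host code, which is not part of these files.

#include <cstdio>

int main() {
    // Hypothetical sizes: a 3x3 kernel over an 8-wide output row.
    const long OW = 8, KW = 3, KH = 3;
    const long ksize = OW * (KH > 1 ? KW : 1);   // 24, as in the kernels above
    const long ids[] = {0, 23, 24, 50};          // sample work-item ids
    for (long i : ids) {
        long kx = i / ksize;
        long ky = (i - kx * ksize) / OW;
        long ix = i % OW;
        printf("i=%ld -> kx=%ld ky=%ld ix=%ld\n", i, kx, ky, ix);
    }
    return 0;
}
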
ggml/src/ggml-opencl/kernels/mul.cl ADDED
@@ -0,0 +1,79 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // mul
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_mul(
7
+ global char * src0,
8
+ ulong offset0,
9
+ global char * src1,
10
+ ulong offset1,
11
+ global char * dst,
12
+ ulong offsetd,
13
+ int ne00,
14
+ int ne01,
15
+ int ne02,
16
+ int ne03,
17
+ ulong nb00,
18
+ ulong nb01,
19
+ ulong nb02,
20
+ ulong nb03,
21
+ int ne10,
22
+ int ne11,
23
+ int ne12,
24
+ int ne13,
25
+ ulong nb10,
26
+ ulong nb11,
27
+ ulong nb12,
28
+ ulong nb13,
29
+ int ne0,
30
+ int ne1,
31
+ int ne2,
32
+ int ne3,
33
+ ulong nb0,
34
+ ulong nb1,
35
+ ulong nb2,
36
+ ulong nb3
37
+ ) {
38
+ src0 = src0 + offset0;
39
+ src1 = src1 + offset1;
40
+ dst = dst + offsetd;
41
+
42
+ int i03 = get_group_id(2);
43
+ int i02 = get_group_id(1);
44
+ int i01 = get_group_id(0);
45
+
46
+ int i13 = i03 % ne13;
47
+ int i12 = i02 % ne12;
48
+ int i11 = i01 % ne11;
49
+
50
+ global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
51
+ global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
52
+ global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
53
+
54
+ for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
55
+ const int i10 = i0 % ne10;
56
+ *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) * *((global float *)(src1_ptr + i10*nb10));
57
+ }
58
+ }
59
+
60
+ // assumption: src1 is a row
61
+ // broadcast src1 into src0
62
+ kernel void kernel_mul_row(
63
+ global float4 * src0,
64
+ ulong offset0,
65
+ global float4 * src1,
66
+ ulong offset1,
67
+ global float4 * dst,
68
+ ulong offsetd,
69
+ int ne
70
+ ) {
71
+ src0 = (global float4*)((global char*)src0 + offset0);
72
+ src1 = (global float4*)((global char*)src1 + offset1);
73
+ dst = (global float4*)((global char*)dst + offsetd);
74
+
75
+ // This performs better than using %.
76
+ uint gid = get_global_id(0);
77
+ uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
78
+ dst[gid] = src0[gid] * src1[idx1];
79
+ }
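
kernel_mul_row replaces the modulo with gid - (gid/ne)*ne because, per its own comment, that formulation performs better on the targeted GPUs; for unsigned operands the two are equivalent, as this throwaway C++ check illustrates (values arbitrary).

#include <cstdio>

int main() {
    const unsigned ne = 96;                       // row length (arbitrary)
    const unsigned gids[] = {0u, 95u, 96u, 1000u};
    for (unsigned gid : gids) {
        unsigned idx1 = gid - (gid / ne) * ne;    // the kernel's formulation
        printf("gid=%u idx1=%u gid%%ne=%u\n", gid, idx1, gid % ne);
    }
    return 0;
}
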
ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl ADDED
@@ -0,0 +1,139 @@
1
+ // src0_q, src0_d, src1 are transposed as a preprocessing step
2
+ // 4-bit weights are transposed in groups of 4 (unsigned short int)
3
+ // consider weights originally "next to each other", now "on top of each other"
4
+ // each fiber computes an 8x4 tile of output elements
5
+ // using unshuffled weights
6
+
7
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
8
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
9
+
10
+ #ifdef cl_qcom_reqd_sub_group_size
11
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
12
+ #define ADRENO_GPU 1
13
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
14
+ #endif
15
+
16
+ #ifdef ADRENO_GPU
17
+ REQD_SUBGROUP_SIZE_128
18
+ #endif
19
+
20
+ kernel void kernel_mul_mat_Ab_Bi_8x4(
21
+ global const ushort * src0_q, // quantized A
22
+ global const half * src0_d, // A scales
23
+ __read_only image1d_buffer_t src1, // B (1d image)
24
+ global float * dst, // C
25
+ int m, // M
26
+ int n, // N with padding
27
+ int k, // K
28
+ int n_no_padding // N without padding
29
+ ) {
30
+
31
+ int m_4 = m >> 2;
32
+ int n_4 = n >> 2;
33
+
34
+ int gy = get_global_id(0);
35
+ int gx = get_global_id(1);
36
+ int gx_2 = gx << 2;
37
+
38
+ half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
39
+ half8 B; // registers for activations
40
+ half4 dequantized_weights; // registers for dequantized weights
41
+ __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
42
+ __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
43
+
44
+ for(int i=0; i<k; i+=4){ //loop through K dimension
45
+
46
+ B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
47
+ B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
48
+
49
+ // keep (i/4) and (i/32) in parentheses; integer division rounds down
50
+ // load 4 consecutive groups of 4 weights
51
+ ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
52
+
53
+ // load 4 consecutive scales
54
+ half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
55
+
56
+ // j=0
57
+ dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
58
+ dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
59
+ dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
60
+ dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
61
+ c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
62
+ c1 += B * dequantized_weights.s1;
63
+ c2 += B * dequantized_weights.s2;
64
+ c3 += B * dequantized_weights.s3;
65
+
66
+ // j=1
67
+ B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
68
+ B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
69
+ dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
70
+ dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
71
+ dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
72
+ dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
73
+ c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
74
+ c1 += B * dequantized_weights.s1;
75
+ c2 += B * dequantized_weights.s2;
76
+ c3 += B * dequantized_weights.s3;
77
+
78
+ // j=2
79
+ B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
80
+ B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
81
+ dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
82
+ dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
83
+ dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
84
+ dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
85
+ c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
86
+ c1 += B * dequantized_weights.s1;
87
+ c2 += B * dequantized_weights.s2;
88
+ c3 += B * dequantized_weights.s3;
89
+
90
+ // j=3
91
+ B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
92
+ B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
93
+ dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
94
+ dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
95
+ dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
96
+ dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
97
+ c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
98
+ c1 += B * dequantized_weights.s1;
99
+ c2 += B * dequantized_weights.s2;
100
+ c3 += B * dequantized_weights.s3;
101
+ }
102
+
103
+ int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
104
+
105
+ // conditional check if store is to a valid location. Required when N is not a multiple of 8
106
+ // if statements allow registers to be reused for each store
107
+ // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
108
+ if(idx+3 < m*n_no_padding){
109
+ vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
110
+ idx += m;
111
+ }
112
+ if(idx+3 < m*n_no_padding){
113
+ vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
114
+ idx += m;
115
+ }
116
+ if(idx+3 < m*n_no_padding){
117
+ vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
118
+ idx += m;
119
+ }
120
+ if(idx+3 < m*n_no_padding){
121
+ vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
122
+ idx += m;
123
+ }
124
+ if(idx+3 < m*n_no_padding){
125
+ vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
126
+ idx += m;
127
+ }
128
+ if(idx+3 < m*n_no_padding){
129
+ vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
130
+ idx += m;
131
+ }
132
+ if(idx+3 < m*n_no_padding){
133
+ vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
134
+ idx += m;
135
+ }
136
+ if(idx+3 < m*n_no_padding){
137
+ vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
138
+ }
139
+ }
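
The j = 0..3 steps above peel four 4-bit weights out of each ushort with the mask/shift sequence 0x000F, 0x00F0, 0x0F00, 0xF000 - this is what "on top of each other" means in the header comment. A scalar C++ sketch of one such unpack; the packed value and scale are made up.

#include <cstdio>
#include <cstdint>

int main() {
    const uint16_t packed = 0x9B41;   // four 4-bit quants stacked in one ushort
    const float    scale  = 0.25f;
    // Same mask/shift sequence the kernel walks through for j = 0..3.
    const uint16_t mask[4]  = {0x000F, 0x00F0, 0x0F00, 0xF000};
    const int      shift[4] = {0, 4, 8, 12};
    for (int j = 0; j < 4; ++j) {
        float w = (((packed & mask[j]) >> shift[j]) - 8) * scale;  // dequantized weight
        printf("j=%d weight=%f\n", j, w);
    }
    return 0;
}
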
ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl ADDED
@@ -0,0 +1,118 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define N_F16_F16 4
22
+
23
+ #ifdef ADRENO_GPU
24
+ REQD_SUBGROUP_SIZE_64
25
+ #endif
26
+ kernel void kernel_mul_mat_f16_f16(
27
+ global char * src0,
28
+ ulong offset0,
29
+ global char * src1,
30
+ ulong offset1,
31
+ global float * dst,
32
+ ulong offsetd,
33
+ int ne00,
34
+ int ne01,
35
+ int ne02,
36
+ ulong nb00,
37
+ ulong nb01,
38
+ ulong nb02,
39
+ ulong nb03,
40
+ int ne10,
41
+ int ne11,
42
+ int ne12,
43
+ ulong nb10,
44
+ ulong nb11,
45
+ ulong nb12,
46
+ ulong nb13,
47
+ int ne0,
48
+ int ne1,
49
+ int r2,
50
+ int r3)
51
+ {
52
+ src0 = (global char*)((global char*)src0 + offset0);
53
+ src1 = (global char*)((global char*)src1 + offset1);
54
+ dst = (global float*)((global char*)dst + offsetd);
55
+
56
+ int r0 = get_group_id(0);
57
+ int rb = get_group_id(1)*N_F16_F16;
58
+ int im = get_group_id(2);
59
+
60
+ int i12 = im%ne12;
61
+ int i13 = im/ne12;
62
+
63
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
64
+
65
+ global half * x = (global half *) (src0 + offset_src0);
66
+
67
+ if (ne00 < 128) {
68
+ for (int row = 0; row < N_F16_F16; ++row) {
69
+ int r1 = rb + row;
70
+ if (r1 >= ne11) {
71
+ break;
72
+ }
73
+
74
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
75
+
76
+ global half * y = (global half *) (src1 + offset_src1);
77
+
78
+ float sumf = 0;
79
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
80
+ sumf += (half) x[i] * (half) y[i];
81
+ }
82
+
83
+ float all_sum = sub_group_reduce_add(sumf);
84
+ if (get_sub_group_local_id() == 0) {
85
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
86
+ }
87
+ }
88
+ } else {
89
+ global half4 * x4 = (global half4 *)x;
90
+ for (int row = 0; row < N_F16_F16; ++row) {
91
+ int r1 = rb + row;
92
+ if (r1 >= ne11) {
93
+ break;
94
+ }
95
+
96
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
97
+
98
+ global half * y = (global half *) (src1 + offset_src1);
99
+ global half4 * y4 = (global half4 *) y;
100
+
101
+ float sumf = 0;
102
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
103
+ sumf += (half) x4[i].s0 * y4[i].s0;
104
+ sumf += (half) x4[i].s1 * y4[i].s1;
105
+ sumf += (half) x4[i].s2 * y4[i].s2;
106
+ sumf += (half) x4[i].s3 * y4[i].s3;
107
+ }
108
+
109
+ float all_sum = sub_group_reduce_add(sumf);
110
+ if (get_sub_group_local_id() == 0) {
111
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
112
+ all_sum += (half) x[i] * y[i];
113
+ }
114
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
115
+ }
116
+ }
117
+ }
118
+ }
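
kernel_mul_mat_f16_f16 and the f16_f32/f32_f32 variants that follow all use the same pattern: each subgroup lane accumulates a strided slice of the row dot product, then sub_group_reduce_add folds the lane partials into one value that lane 0 writes out. A sequential C++ stand-in for that pattern; SG_SIZE and the data below are illustrative only.

#include <cstdio>
#include <vector>

int main() {
    const int SG_SIZE = 16;            // stand-in for get_max_sub_group_size()
    const int ne00    = 70;            // row length, deliberately not a multiple of SG_SIZE
    std::vector<float> x(ne00), y(ne00);
    for (int i = 0; i < ne00; ++i) { x[i] = 0.01f * i; y[i] = 1.0f; }

    // Each "lane" sums the elements i = lane, lane+SG_SIZE, lane+2*SG_SIZE, ...
    float lane_sum[SG_SIZE] = {0};
    for (int lane = 0; lane < SG_SIZE; ++lane)
        for (int i = lane; i < ne00; i += SG_SIZE)
            lane_sum[lane] += x[i] * y[i];

    // sub_group_reduce_add then folds the lane partials into one value.
    float all_sum = 0.f;
    for (int lane = 0; lane < SG_SIZE; ++lane) all_sum += lane_sum[lane];

    float ref = 0.f;
    for (int i = 0; i < ne00; ++i) ref += x[i] * y[i];
    printf("subgroup-style=%f reference=%f\n", all_sum, ref);
    return 0;
}
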
ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl ADDED
@@ -0,0 +1,118 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define N_F16_F32 4
22
+
23
+ #ifdef ADRENO_GPU
24
+ REQD_SUBGROUP_SIZE_64
25
+ #endif
26
+ kernel void kernel_mul_mat_f16_f32(
27
+ global char * src0,
28
+ ulong offset0,
29
+ global char * src1,
30
+ ulong offset1,
31
+ global float * dst,
32
+ ulong offsetd,
33
+ int ne00,
34
+ int ne01,
35
+ int ne02,
36
+ ulong nb00,
37
+ ulong nb01,
38
+ ulong nb02,
39
+ ulong nb03,
40
+ int ne10,
41
+ int ne11,
42
+ int ne12,
43
+ ulong nb10,
44
+ ulong nb11,
45
+ ulong nb12,
46
+ ulong nb13,
47
+ int ne0,
48
+ int ne1,
49
+ int r2,
50
+ int r3
51
+ ) {
52
+ src0 = (global char*)((global char*)src0 + offset0);
53
+ src1 = (global char*)((global char*)src1 + offset1);
54
+ dst = (global float*)((global char*)dst + offsetd);
55
+
56
+ int r0 = get_group_id(0);
57
+ int rb = get_group_id(1)*N_F16_F32;
58
+ int im = get_group_id(2);
59
+
60
+ int i12 = im%ne12;
61
+ int i13 = im/ne12;
62
+
63
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
64
+
65
+ global half * x = (global half *) (src0 + offset_src0);
66
+
67
+ if (ne00 < 128) {
68
+ for (int row = 0; row < N_F16_F32; ++row) {
69
+ int r1 = rb + row;
70
+ if (r1 >= ne11) {
71
+ break;
72
+ }
73
+
74
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
75
+
76
+ global float * y = (global float *) (src1 + offset_src1);
77
+
78
+ float sumf = 0;
79
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
80
+ sumf += convert_float(x[i]) * y[i];
81
+ }
82
+
83
+ float all_sum = sub_group_reduce_add(sumf);
84
+ if (get_sub_group_local_id() == 0) {
85
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
86
+ }
87
+ }
88
+ } else {
89
+ global half4 * x4 = (global half4 *)x;
90
+ for (int row = 0; row < N_F16_F32; ++row) {
91
+ int r1 = rb + row;
92
+ if (r1 >= ne11) {
93
+ break;
94
+ }
95
+
96
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
97
+
98
+ global float * y = (global float *) (src1 + offset_src1);
99
+ global float4 * y4 = (global float4 *) y;
100
+
101
+ float sumf = 0;
102
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
103
+ sumf += convert_float(x4[i].s0) * y4[i].s0;
104
+ sumf += convert_float(x4[i].s1) * y4[i].s1;
105
+ sumf += convert_float(x4[i].s2) * y4[i].s2;
106
+ sumf += convert_float(x4[i].s3) * y4[i].s3;
107
+ }
108
+
109
+ float all_sum = sub_group_reduce_add(sumf);
110
+ if (get_sub_group_local_id() == 0) {
111
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
112
+ all_sum += (float) x[i] * y[i];
113
+ }
114
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
115
+ }
116
+ }
117
+ }
118
+ }
ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl ADDED
@@ -0,0 +1,94 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #ifdef ADRENO_GPU
22
+ REQD_SUBGROUP_SIZE_64
23
+ #endif
24
+ kernel void kernel_mul_mat_f16_f32_1row(
25
+ global char * src0,
26
+ ulong offset0,
27
+ global char * src1,
28
+ ulong offset1,
29
+ global float * dst,
30
+ ulong offsetd,
31
+ int ne00,
32
+ int ne01,
33
+ int ne02,
34
+ ulong nb00,
35
+ ulong nb01,
36
+ ulong nb02,
37
+ ulong nb03,
38
+ int ne10,
39
+ int ne11,
40
+ int ne12,
41
+ ulong nb10,
42
+ ulong nb11,
43
+ ulong nb12,
44
+ ulong nb13,
45
+ int ne0,
46
+ int ne1,
47
+ int r2,
48
+ int r3
49
+ ) {
50
+ src0 = (global char*)((global char*)src0 + offset0);
51
+ src1 = (global char*)((global char*)src1 + offset1);
52
+ dst = (global float*)((global char*)dst + offsetd);
53
+
54
+ int r0 = get_group_id(0);
55
+ int r1 = get_group_id(1);
56
+ int im = get_group_id(2);
57
+
58
+ int i12 = im%ne12;
59
+ int i13 = im/ne12;
60
+
61
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
62
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
63
+
64
+ global half * x = (global half *) (src0 + offset_src0);
65
+ global float * y = (global float *) (src1 + offset_src1);
66
+
67
+ float sumf = 0;
68
+ if (ne00 < 128) {
69
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
70
+ sumf += (float) x[i] * (float) y[i];
71
+ }
72
+ float all_sum = sub_group_reduce_add(sumf);
73
+ if (get_sub_group_local_id() == 0) {
74
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
75
+ }
76
+ } else {
77
+ global half4 * x4 = (global half4 *) x;
78
+ global float4 * y4 = (global float4 *) y;
79
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
80
+ sumf += (float) x4[i].s0 * y4[i].s0;
81
+ sumf += (float) x4[i].s1 * y4[i].s1;
82
+ sumf += (float) x4[i].s2 * y4[i].s2;
83
+ sumf += (float) x4[i].s3 * y4[i].s3;
84
+ }
85
+ float all_sum = sub_group_reduce_add(sumf);
86
+ if (get_sub_group_local_id() == 0) {
87
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
88
+ all_sum += (float) x[i] * y[i];
89
+ }
90
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
91
+ }
92
+ }
93
+
94
+ }
ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl ADDED
@@ -0,0 +1,84 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ // Assumes row size (ne00) is a multiple of 4
22
+ #ifdef ADRENO_GPU
23
+ REQD_SUBGROUP_SIZE_64
24
+ #endif
25
+ kernel void kernel_mul_mat_f16_f32_l4(
26
+ global char * src0,
27
+ ulong offset0,
28
+ global char * src1,
29
+ ulong offset1,
30
+ global float * dst,
31
+ ulong offsetd,
32
+ int ne00,
33
+ int ne01,
34
+ int ne02,
35
+ ulong nb00,
36
+ ulong nb01,
37
+ ulong nb02,
38
+ ulong nb03,
39
+ int ne10,
40
+ int ne11,
41
+ int ne12,
42
+ ulong nb10,
43
+ ulong nb11,
44
+ ulong nb12,
45
+ ulong nb13,
46
+ int ne0,
47
+ int ne1,
48
+ int r2,
49
+ int r3
50
+ ) {
51
+ src0 = (global char*)((global char*)src0 + offset0);
52
+ src1 = (global char*)((global char*)src1 + offset1);
53
+ dst = (global float*)((global char*)dst + offsetd);
54
+
55
+ int nrows = ne11;
56
+ int r0 = get_group_id(0);
57
+ int im = get_group_id(2);
58
+
59
+ int i12 = im%ne12;
60
+ int i13 = im/ne12;
61
+
62
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
63
+
64
+ global half4 * x4 = (global half4 *) (src0 + offset_src0);
65
+
66
+ for (int r1 = 0; r1 < nrows; ++r1) {
67
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
68
+
69
+ global float4 * y4 = (global float4 *) (src1 + offset_src1);
70
+
71
+ float sumf = 0;
72
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
73
+ sumf += convert_float(x4[i].s0) * y4[i].s0;
74
+ sumf += convert_float(x4[i].s1) * y4[i].s1;
75
+ sumf += convert_float(x4[i].s2) * y4[i].s2;
76
+ sumf += convert_float(x4[i].s3) * y4[i].s3;
77
+ }
78
+
79
+ float all_sum = sub_group_reduce_add(sumf);
80
+ if (get_sub_group_local_id() == 0) {
81
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
82
+ }
83
+ }
84
+ }
ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl ADDED
@@ -0,0 +1,118 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define N_F32_F32 4
22
+
23
+ #ifdef ADRENO_GPU
24
+ REQD_SUBGROUP_SIZE_64
25
+ #endif
26
+ kernel void kernel_mul_mat_f32_f32(
27
+ global char * src0,
28
+ ulong offset0,
29
+ global char * src1,
30
+ ulong offset1,
31
+ global float * dst,
32
+ ulong offsetd,
33
+ int ne00,
34
+ int ne01,
35
+ int ne02,
36
+ ulong nb00,
37
+ ulong nb01,
38
+ ulong nb02,
39
+ ulong nb03,
40
+ int ne10,
41
+ int ne11,
42
+ int ne12,
43
+ ulong nb10,
44
+ ulong nb11,
45
+ ulong nb12,
46
+ ulong nb13,
47
+ int ne0,
48
+ int ne1,
49
+ int r2,
50
+ int r3
51
+ ) {
52
+ src0 = (global char*)((global char*)src0 + offset0);
53
+ src1 = (global char*)((global char*)src1 + offset1);
54
+ dst = (global float*)((global char*)dst + offsetd);
55
+
56
+ int r0 = get_group_id(0);
57
+ int rb = get_group_id(1)*N_F32_F32;
58
+ int im = get_group_id(2);
59
+
60
+ int i12 = im%ne12;
61
+ int i13 = im/ne12;
62
+
63
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
64
+
65
+ global float * x = (global float *) (src0 + offset_src0);
66
+
67
+ if (ne00 < 128) {
68
+ for (int row = 0; row < N_F32_F32; ++row) {
69
+ int r1 = rb + row;
70
+ if (r1 >= ne11) {
71
+ break;
72
+ }
73
+
74
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
75
+
76
+ global float * y = (global float *) (src1 + offset_src1);
77
+
78
+ float sumf = 0;
79
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
80
+ sumf += (float) x[i] * (float) y[i];
81
+ }
82
+
83
+ float all_sum = sub_group_reduce_add(sumf);
84
+ if (get_sub_group_local_id() == 0) {
85
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
86
+ }
87
+ }
88
+ } else {
89
+ global float4 * x4 = (global float4 *)x;
90
+ for (int row = 0; row < N_F32_F32; ++row) {
91
+ int r1 = rb + row;
92
+ if (r1 >= ne11) {
93
+ break;
94
+ }
95
+
96
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
97
+
98
+ global float * y = (global float *) (src1 + offset_src1);
99
+ global float4 * y4 = (global float4 *) y;
100
+
101
+ float sumf = 0;
102
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
103
+ sumf += (float) x4[i].s0 * y4[i].s0;
104
+ sumf += (float) x4[i].s1 * y4[i].s1;
105
+ sumf += (float) x4[i].s2 * y4[i].s2;
106
+ sumf += (float) x4[i].s3 * y4[i].s3;
107
+ }
108
+
109
+ float all_sum = sub_group_reduce_add(sumf);
110
+ if (get_sub_group_local_id() == 0) {
111
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
112
+ all_sum += (float) x[i] * y[i];
113
+ }
114
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
115
+ }
116
+ }
117
+ }
118
+ }
ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl ADDED
@@ -0,0 +1,192 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ //------------------------------------------------------------------------------
51
+ // mul_vec_q_n_f32
52
+ //------------------------------------------------------------------------------
53
+ // function to calculate the inner product between half a q4_0 block and 16 floats (yl); sumy is SUM(yl[i])
54
+ // il indicates where the q4 quants begin (0 or QK4_0/4)
55
+ // we assume that the yl's have been multiplied with the appropriate scale factor
56
+ // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
57
+ inline float block_q_4_0_dot_y(
58
+ global struct block_q4_0 * qb_curr,
59
+ float sumy,
60
+ private float * yl,
61
+ int il
62
+ ) {
63
+ float d = qb_curr->d;
64
+ float2 acc = 0.f;
65
+ global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
66
+ for (int i = 0; i < 8; i+=2) {
67
+ acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
68
+ + yl[i + 1] * (qs[i / 2] & 0x0F00);
69
+ acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
70
+ + yl[i + 9] * (qs[i / 2] & 0xF000);
71
+ }
72
+ return d * (sumy * -8.f + acc.s0 + acc.s1);
73
+ }
74
+
75
+ #ifdef INTEL_GPU
76
+ #define N_DST 4 // each SIMD group works on 4 rows
77
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
78
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
79
+ #elif defined (ADRENO_GPU)
80
+ #define N_DST 4
81
+ #define N_SIMDGROUP 1
82
+ #define N_SIMDWIDTH 64
83
+ #endif
84
+
85
+ inline void mul_vec_q_n_f32(
86
+ global void * src0,
87
+ global float * src1,
88
+ global float * dst,
89
+ int ne00,
90
+ int ne01,
91
+ int ne02,
92
+ int ne10,
93
+ int ne12,
94
+ int ne0,
95
+ int ne1,
96
+ int r2,
97
+ int r3
98
+ ) {
99
+
100
+ const ulong nb = ne00/QK4_0;
101
+
102
+ int r0 = get_group_id(0);
103
+ int r1 = get_group_id(1);
104
+ int im = get_group_id(2);
105
+
106
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
107
+ // id of a SIMD group in the grid.
108
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
109
+
110
+ int i12 = im%ne12;
111
+ int i13 = im/ne12;
112
+
113
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
114
+
115
+ global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
116
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
117
+
118
+ float yl[16]; // src1 vector cache
119
+ float sumf[N_DST]={0.f};
120
+
121
+ int ix = get_sub_group_local_id()/2;
122
+ int il = 8*(get_sub_group_local_id()%2);
123
+
124
+ global float * yb = y + ix * QK4_0 + il;
125
+
126
+ // each thread in a SIMD group deals with half a block.
127
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
128
+ float sumy = 0;
129
+ for (int i = 0; i < 8; i += 2) {
130
+ sumy += yb[i] + yb[i+1];
131
+ yl[i+0] = yb[i+ 0];
132
+ yl[i+1] = yb[i+ 1]/256.f;
133
+ sumy += yb[i+16] + yb[i+17];
134
+ yl[i+8] = yb[i+16]/16.f;
135
+ yl[i+9] = yb[i+17]/4096.f;
136
+ }
137
+
138
+ for (int row = 0; row < N_DST; row++) {
139
+ sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
140
+ }
141
+
142
+ // One thread in a SIMD group (i.e., subgroup) handles a half block,
143
+ // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
144
+ // y points to the activation matrix (of type float). Therefore for
145
+ // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
146
+ // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
147
+ // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
148
+ yb += QK4_0 * (N_SIMDWIDTH/2);
149
+ }
150
+
151
+ // The above does not work for Adreno - it produces incorrect results for
152
+ // row = 1, 2, 3 and only row = 0 gives the correct result.
153
+ // If N_DST is changed, the below array must be initialized accordingly.
154
+ // This also seems to perform better on Intel.
155
+ float tot[N_DST] = {
156
+ sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
157
+ sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
158
+ for (int row = 0; row < N_DST; ++row) {
159
+ if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
160
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
161
+ }
162
+ }
163
+ }
164
+
165
+ #ifdef INTEL_GPU
166
+ REQD_SUBGROUP_SIZE_16
167
+ #elif defined (ADRENO_GPU)
168
+ REQD_SUBGROUP_SIZE_64
169
+ #endif
170
+ kernel void kernel_mul_mat_q4_0_f32(
171
+ global void * src0,
172
+ ulong offset0,
173
+ global float * src1,
174
+ ulong offset1,
175
+ global float * dst,
176
+ ulong offsetd,
177
+ int ne00,
178
+ int ne01,
179
+ int ne02,
180
+ int ne10,
181
+ int ne12,
182
+ int ne0,
183
+ int ne1,
184
+ int r2,
185
+ int r3
186
+ ) {
187
+ src0 = (global void*)((global char*)src0 + offset0);
188
+ src1 = (global float*)((global char*)src1 + offset1);
189
+ dst = (global float*)((global char*)dst + offsetd);
190
+
191
+ mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
192
+ }
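
block_q_4_0_dot_y relies on two identities: pre-scaling the activations by 1, 1/16, 1/256 and 1/4096 cancels the bit positions of the masked nibbles, and the constant -8 offset of q4_0 factors out as sumy * -8. A minimal C++ check of that formulation against the naive d * sum((q_i - 8) * y_i) for one packed ushort; the values are made up.

#include <cstdio>
#include <cstdint>

int main() {
    const uint16_t qs   = 0xA5C3;              // q0=3, q1=12, q2=5, q3=10
    const float    y[4] = {0.25f, -1.5f, 2.0f, 0.75f};
    const float    d    = 0.125f;              // block scale

    // Naive: d * sum((q_i - 8) * y_i)
    int q[4] = { qs & 0xF, (qs >> 4) & 0xF, (qs >> 8) & 0xF, (qs >> 12) & 0xF };
    float ref = 0.f;
    for (int i = 0; i < 4; ++i) ref += (q[i] - 8) * y[i];
    ref *= d;

    // Kernel-style: pre-scale y by 1, 1/16, 1/256, 1/4096 so the masked values
    // (which are q_i << 0/4/8/12) need no shift, and fold the -8 into sumy.
    float sumy = y[0] + y[1] + y[2] + y[3];
    float acc  = y[0]          * (qs & 0x000F)
               + y[1] / 16.f   * (qs & 0x00F0)
               + y[2] / 256.f  * (qs & 0x0F00)
               + y[3] / 4096.f * (qs & 0xF000);
    float opt = d * (sumy * -8.f + acc);

    printf("ref=%f kernel-style=%f\n", ref, opt);  // the two agree
    return 0;
}
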
ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl ADDED
@@ -0,0 +1,307 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ inline float mm_block_q_4_0_dot_y_flat(
51
+ global uchar * x,
52
+ global half * dh,
53
+ float sumy,
54
+ float16 yl,
55
+ int il
56
+ ) {
57
+ float d = *dh;
58
+ global ushort * qs = ((global ushort *)x + il/2);
59
+ float acc = 0.f;
60
+
61
+ acc += yl.s0 * (qs[0] & 0x000F);
62
+ acc += yl.s1 * (qs[0] & 0x0F00);
63
+ acc += yl.s8 * (qs[0] & 0x00F0);
64
+ acc += yl.s9 * (qs[0] & 0xF000);
65
+
66
+ acc += yl.s2 * (qs[1] & 0x000F);
67
+ acc += yl.s3 * (qs[1] & 0x0F00);
68
+ acc += yl.sa * (qs[1] & 0x00F0);
69
+ acc += yl.sb * (qs[1] & 0xF000);
70
+
71
+ acc += yl.s4 * (qs[2] & 0x000F);
72
+ acc += yl.s5 * (qs[2] & 0x0F00);
73
+ acc += yl.sc * (qs[2] & 0x00F0);
74
+ acc += yl.sd * (qs[2] & 0xF000);
75
+
76
+ acc += yl.s6 * (qs[3] & 0x000F);
77
+ acc += yl.s7 * (qs[3] & 0x0F00);
78
+ acc += yl.se * (qs[3] & 0x00F0);
79
+ acc += yl.sf * (qs[3] & 0xF000);
80
+
81
+ return d * (sumy * -8.f + acc);
82
+ }
83
+
84
+ #ifdef INTEL_GPU
85
+ #define N_DST 16 // each SIMD group works on 16 rows (in weights matrix)
86
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
87
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
88
+ #elif defined (ADRENO_GPU)
89
+ #define N_DST 16
90
+ #define N_SIMDGROUP 1
91
+ #define N_SIMDWIDTH 64
92
+ #endif
93
+ //
94
+ // This variant performs 1d blocking with 16x output.
95
+ // Each simdgroup outputs 16 values on `n0` dim (row in the output matrix).
96
+ //
97
+ inline void mul_mat_q_n_f32_1d_16x_flat(
98
+ global uchar * src0_q,
99
+ global half * src0_d,
100
+ global float * src1,
101
+ global float * dst,
102
+ int ne00,
103
+ int ne01,
104
+ int ne02,
105
+ int ne10,
106
+ int ne12,
107
+ int ne0,
108
+ int ne1,
109
+ int r2,
110
+ int r3
111
+ ) {
112
+ const int nb = ne00/QK4_0;
113
+
114
+ int r0 = get_group_id(0);
115
+ int r1 = get_group_id(1);
116
+ int im = get_group_id(2);
117
+
118
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
119
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
120
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
121
+ // Currently with llama2 7B, im is always 0.
122
+ // TODO: how to handle im/gqa*(nb*ne0)?
123
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
124
+
125
+ int i12 = im%ne12;
126
+ int i13 = im/ne12;
127
+
128
+ // The number of scales is the same as the number of blocks.
129
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
130
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
131
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
132
+
133
+ global uchar * x = (global uchar *) src0_q + offset0_q;
134
+ global half * d = (global half *) src0_d + offset0_d;
135
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
136
+
137
+ float16 yl;
138
+ float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
139
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
140
+
141
+ int ix = get_sub_group_local_id()/2;
142
+ int il = 8*(get_sub_group_local_id()%2);
143
+
144
+ global float * yb = y + ix*QK4_0 + il;
145
+
146
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
147
+ float sumy = 0.f;
148
+
149
+ sumy += yb[0];
150
+ sumy += yb[1];
151
+ sumy += yb[2];
152
+ sumy += yb[3];
153
+ sumy += yb[4];
154
+ sumy += yb[5];
155
+ sumy += yb[6];
156
+ sumy += yb[7];
157
+
158
+ sumy += yb[16];
159
+ sumy += yb[17];
160
+ sumy += yb[18];
161
+ sumy += yb[19];
162
+ sumy += yb[20];
163
+ sumy += yb[21];
164
+ sumy += yb[22];
165
+ sumy += yb[23];
166
+
167
+ yl.s0 = yb[0];
168
+ yl.s1 = yb[1]/256.f;
169
+
170
+ yl.s2 = yb[2];
171
+ yl.s3 = yb[3]/256.f;
172
+
173
+ yl.s4 = yb[4];
174
+ yl.s5 = yb[5]/256.f;
175
+
176
+ yl.s6 = yb[6];
177
+ yl.s7 = yb[7]/256.f;
178
+
179
+ yl.s8 = yb[16]/16.f;
180
+ yl.s9 = yb[17]/4096.f;
181
+
182
+ yl.sa = yb[18]/16.f;
183
+ yl.sb = yb[19]/4096.f;
184
+
185
+ yl.sc = yb[20]/16.f;
186
+ yl.sd = yb[21]/4096.f;
187
+
188
+ yl.se = yb[22]/16.f;
189
+ yl.sf = yb[23]/4096.f;
190
+
191
+ sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
192
+ sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
193
+ sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
194
+ sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
195
+
196
+ sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
197
+ sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
198
+ sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
199
+ sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
200
+
201
+ sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 8*nb*QK4_0/2, d + ib + 8*nb, sumy, yl, il);
202
+ sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 9*nb*QK4_0/2, d + ib + 9*nb, sumy, yl, il);
203
+ sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
204
+ sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
205
+
206
+ sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
207
+ sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
208
+ sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
209
+ sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
210
+
211
+ yb += QK4_0 * (N_SIMDWIDTH/2);
212
+ }
213
+
214
+ float16 tot = (float16)(
215
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
216
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
217
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
218
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
219
+
220
+ sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
221
+ sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
222
+ sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
223
+ sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
224
+ );
225
+
226
+ if (get_sub_group_local_id() == 0) {
227
+ if (first_row + 0 < ne01) {
228
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
229
+ }
230
+ if (first_row + 1 < ne01) {
231
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
232
+ }
233
+ if (first_row + 2 < ne01) {
234
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
235
+ }
236
+ if (first_row + 3 < ne01) {
237
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
238
+ }
239
+
240
+ if (first_row + 4 < ne01) {
241
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
242
+ }
243
+ if (first_row + 5 < ne01) {
244
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
245
+ }
246
+ if (first_row + 6 < ne01) {
247
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
248
+ }
249
+ if (first_row + 7 < ne01) {
250
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
251
+ }
252
+
253
+ if (first_row + 8 < ne01) {
254
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
255
+ }
256
+ if (first_row + 9 < ne01) {
257
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
258
+ }
259
+ if (first_row + 10 < ne01) {
260
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
261
+ }
262
+ if (first_row + 11 < ne01) {
263
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
264
+ }
265
+
266
+ if (first_row + 12 < ne01) {
267
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
268
+ }
269
+ if (first_row + 13 < ne01) {
270
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
271
+ }
272
+ if (first_row + 14 < ne01) {
273
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
274
+ }
275
+ if (first_row + 15 < ne01) {
276
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
277
+ }
278
+ }
279
+ }
280
+
281
+ #ifdef INTEL_GPU
282
+ REQD_SUBGROUP_SIZE_16
283
+ #elif defined (ADRENO_GPU)
284
+ REQD_SUBGROUP_SIZE_64
285
+ #endif
286
+ kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
287
+ global uchar * src0_q,
288
+ global half * src0_d,
289
+ global float * src1,
290
+ ulong offset1,
291
+ global float * dst,
292
+ ulong offsetd,
293
+ int ne00,
294
+ int ne01,
295
+ int ne02,
296
+ int ne10,
297
+ int ne12,
298
+ int ne0,
299
+ int ne1,
300
+ int r2,
301
+ int r3
302
+ ) {
303
+ src1 = (global float*)((global char*)src1 + offset1);
304
+ dst = (global float*)((global char*)dst + offsetd);
305
+
306
+ mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
307
+ }
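
In the flat variants the q4_0 scales and quant nibbles live in separate buffers, so the two base offsets advance at different granularities: offset0_d counts blocks (one half-precision scale per block) while offset0_q counts bytes (QK4_0/2 packed uchars per block). A tiny arithmetic sketch with made-up sizes and i12 = i13 = 0:

#include <cstdio>

int main() {
    const int QK4_0 = 32;
    int ne00      = 4096;               // K
    int nb        = ne00 / QK4_0;       // 128 blocks (and 128 scales) per row
    int first_row = 32;                 // first of the N_DST rows this subgroup owns

    long offset0_d = (long)first_row * nb;              // in half-precision scales
    long offset0_q = (long)first_row * nb * QK4_0 / 2;  // in bytes of packed quants
    printf("scale offset = %ld halves, quant offset = %ld bytes\n", offset0_d, offset0_q);
    return 0;
}
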
ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl ADDED
@@ -0,0 +1,265 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ inline float mm_block_q_4_0_dot_y_flat(
51
+ global uchar * x,
52
+ global half * dh,
53
+ float sumy,
54
+ float16 yl,
55
+ int il
56
+ ) {
57
+ float d = *dh;
58
+ global ushort * qs = ((global ushort *)x + il/2);
59
+ float acc = 0.f;
60
+
61
+ acc += yl.s0 * (qs[0] & 0x000F);
62
+ acc += yl.s1 * (qs[0] & 0x0F00);
63
+ acc += yl.s8 * (qs[0] & 0x00F0);
64
+ acc += yl.s9 * (qs[0] & 0xF000);
65
+
66
+ acc += yl.s2 * (qs[1] & 0x000F);
67
+ acc += yl.s3 * (qs[1] & 0x0F00);
68
+ acc += yl.sa * (qs[1] & 0x00F0);
69
+ acc += yl.sb * (qs[1] & 0xF000);
70
+
71
+ acc += yl.s4 * (qs[2] & 0x000F);
72
+ acc += yl.s5 * (qs[2] & 0x0F00);
73
+ acc += yl.sc * (qs[2] & 0x00F0);
74
+ acc += yl.sd * (qs[2] & 0xF000);
75
+
76
+ acc += yl.s6 * (qs[3] & 0x000F);
77
+ acc += yl.s7 * (qs[3] & 0x0F00);
78
+ acc += yl.se * (qs[3] & 0x00F0);
79
+ acc += yl.sf * (qs[3] & 0xF000);
80
+
81
+ return d * (sumy * -8.f + acc);
82
+ }
83
+
84
+ #ifdef INTEL_GPU
85
+ #define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
86
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
87
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
88
+ #elif defined (ADRENO_GPU)
89
+ #define N_DST 8
90
+ #define N_SIMDGROUP 1
91
+ #define N_SIMDWIDTH 64
92
+ #endif
93
+ //
94
+ // This variant performs 1d blocking with 8x output.
95
+ // Each simdgroup outputs 8 values on the `n0` dim (rows in the output matrix).
96
+ //
97
+ inline void mul_mat_q_n_f32_1d_8x_flat(
98
+ global uchar * src0_q,
99
+ global half * src0_d,
100
+ global float * src1,
101
+ global float * dst,
102
+ int ne00,
103
+ int ne01,
104
+ int ne02,
105
+ int ne10,
106
+ int ne12,
107
+ int ne0,
108
+ int ne1,
109
+ int r2,
110
+ int r3
111
+ ) {
112
+ const int nb = ne00/QK4_0;
113
+
114
+ int r0 = get_group_id(0);
115
+ int r1 = get_group_id(1);
116
+ int im = get_group_id(2);
117
+
118
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
119
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
120
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
121
+ // Currently with llama2 7B, im is always 0.
122
+ // TODO: how to handle im/gqa*(nb*ne0)?
123
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
124
+
125
+ int i12 = im%ne12;
126
+ int i13 = im/ne12;
127
+
128
+ // The number of scales is the same as the number of blocks.
129
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
130
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
131
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
132
+
133
+ global uchar * x = (global uchar *) src0_q + offset0_q;
134
+ global half * d = (global half *) src0_d + offset0_d;
135
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
136
+
137
+ float16 yl;
138
+ float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
139
+
140
+ int ix = get_sub_group_local_id()/2;
141
+ int il = 8*(get_sub_group_local_id()%2);
142
+
143
+ global float * yb = y + ix*QK4_0 + il;
144
+
145
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
146
+ float sumy = 0.f;
147
+
148
+ sumy += yb[0];
149
+ sumy += yb[1];
150
+ sumy += yb[2];
151
+ sumy += yb[3];
152
+ sumy += yb[4];
153
+ sumy += yb[5];
154
+ sumy += yb[6];
155
+ sumy += yb[7];
156
+
157
+ sumy += yb[16];
158
+ sumy += yb[17];
159
+ sumy += yb[18];
160
+ sumy += yb[19];
161
+ sumy += yb[20];
162
+ sumy += yb[21];
163
+ sumy += yb[22];
164
+ sumy += yb[23];
165
+
166
+ yl.s0 = yb[0];
167
+ yl.s1 = yb[1]/256.f;
168
+
169
+ yl.s2 = yb[2];
170
+ yl.s3 = yb[3]/256.f;
171
+
172
+ yl.s4 = yb[4];
173
+ yl.s5 = yb[5]/256.f;
174
+
175
+ yl.s6 = yb[6];
176
+ yl.s7 = yb[7]/256.f;
177
+
178
+ yl.s8 = yb[16]/16.f;
179
+ yl.s9 = yb[17]/4096.f;
180
+
181
+ yl.sa = yb[18]/16.f;
182
+ yl.sb = yb[19]/4096.f;
183
+
184
+ yl.sc = yb[20]/16.f;
185
+ yl.sd = yb[21]/4096.f;
186
+
187
+ yl.se = yb[22]/16.f;
188
+ yl.sf = yb[23]/4096.f;
189
+
190
+ sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
191
+ sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
192
+ sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
193
+ sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
194
+
195
+ sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
196
+ sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
197
+ sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
198
+ sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
199
+
200
+ yb += QK4_0 * (N_SIMDWIDTH/2);
201
+ }
202
+
203
+ float8 tot = (float8)(
204
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
205
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
206
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
207
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
208
+ );
209
+
210
+ if (get_sub_group_local_id() == 0) {
211
+ if (first_row + 0 < ne01) {
212
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
213
+ }
214
+ if (first_row + 1 < ne01) {
215
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
216
+ }
217
+ if (first_row + 2 < ne01) {
218
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
219
+ }
220
+ if (first_row + 3 < ne01) {
221
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
222
+ }
223
+
224
+ if (first_row + 4 < ne01) {
225
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
226
+ }
227
+ if (first_row + 5 < ne01) {
228
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
229
+ }
230
+ if (first_row + 6 < ne01) {
231
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
232
+ }
233
+ if (first_row + 7 < ne01) {
234
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
235
+ }
236
+ }
237
+ }
238
+
239
+ #ifdef INTEL_GPU
240
+ REQD_SUBGROUP_SIZE_16
241
+ #elif defined (ADRENO_GPU)
242
+ REQD_SUBGROUP_SIZE_64
243
+ #endif
244
+ kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
245
+ global uchar * src0_q,
246
+ global half * src0_d,
247
+ global float * src1,
248
+ ulong offset1,
249
+ global float * dst,
250
+ ulong offsetd,
251
+ int ne00,
252
+ int ne01,
253
+ int ne02,
254
+ int ne10,
255
+ int ne12,
256
+ int ne0,
257
+ int ne1,
258
+ int r2,
259
+ int r3
260
+ ) {
261
+ src1 = (global float*)((global char*)src1 + offset1);
262
+ dst = (global float*)((global char*)dst + offsetd);
263
+
264
+ mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
265
+ }
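mm_block_q_4_0_dot_y_flat above relies on a small trick: instead of shifting each 4-bit quant down before multiplying, it masks the nibble in place inside a 16-bit word and pre-divides the corresponding activation by 16, 256 or 4096, so the shift is folded into the multiply; the q4_0 zero-point of 8 is applied once via sumy * -8.f. A self-contained C check of the idea (the numbers are arbitrary, illustration only, not part of the PR):

#include <stdio.h>

int main(void) {
    unsigned short qs0 = 0x7A3C;   // four packed 4-bit quants: 0xC, 0x3, 0xA, 0x7 (low to high)
    float y0 = 0.25f, y1 = -1.5f;  // two activation values

    // kernel-style: mask the nibble in place, pre-scale the activation
    float acc_kernel = y0 * (float)(qs0 & 0x000F)
                     + (y1 / 256.0f) * (float)(qs0 & 0x0F00);

    // plain form: extract the nibble first
    float acc_plain = y0 * (float)( qs0       & 0xF)
                    + y1 * (float)((qs0 >> 8) & 0xF);

    printf("kernel-style: %f, plain: %f\n", acc_kernel, acc_plain);

    // the q4_0 offset of 8: sum_i y_i*(q_i - 8) == (sum_i y_i*q_i) + sumy * -8
    float sumy = y0 + y1;
    printf("with the zero-point folded in: %f\n", acc_plain + sumy * -8.0f);
    return 0;
}

The /16.f and /4096.f terms in the kernel play the same role for the high nibbles (masks 0x00F0 and 0xF000), which belong to the second half of the block because of the shuffled q4_0 packing.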
ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl ADDED
@@ -0,0 +1,272 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ // This function requires the original shuffled weights.
51
+ // As a reminder, the original weights are shuffled so that (q[0], q[16]) are
52
+ // packed together in a byte, so are (q[1], q[17]) and so on.
53
+ inline float block_q_4_0_dot_y_flat(
54
+ global uchar * x,
55
+ global half * dh,
56
+ float sumy,
57
+ float16 yl,
58
+ int il
59
+ ) {
60
+ float d = *dh;
61
+ global ushort * qs = ((global ushort *)x + il/2);
62
+ float acc = 0.f;
63
+
64
+ acc += yl.s0 * (qs[0] & 0x000F);
65
+ acc += yl.s1 * (qs[0] & 0x0F00);
66
+ acc += yl.s8 * (qs[0] & 0x00F0);
67
+ acc += yl.s9 * (qs[0] & 0xF000);
68
+
69
+ acc += yl.s2 * (qs[1] & 0x000F);
70
+ acc += yl.s3 * (qs[1] & 0x0F00);
71
+ acc += yl.sa * (qs[1] & 0x00F0);
72
+ acc += yl.sb * (qs[1] & 0xF000);
73
+
74
+ acc += yl.s4 * (qs[2] & 0x000F);
75
+ acc += yl.s5 * (qs[2] & 0x0F00);
76
+ acc += yl.sc * (qs[2] & 0x00F0);
77
+ acc += yl.sd * (qs[2] & 0xF000);
78
+
79
+ acc += yl.s6 * (qs[3] & 0x000F);
80
+ acc += yl.s7 * (qs[3] & 0x0F00);
81
+ acc += yl.se * (qs[3] & 0x00F0);
82
+ acc += yl.sf * (qs[3] & 0xF000);
83
+
84
+ return d * (sumy * -8.f + acc);
85
+ }
86
+
87
+ //
88
+ // This variant outputs 8 values.
89
+ //
90
+ #undef N_DST
91
+ #undef N_SIMDGROUP
92
+ #undef N_SIMDWIDTH
93
+
94
+ #ifdef INTEL_GPU
95
+ #define N_DST 8 // each SIMD group works on 8 rows
96
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
97
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 32
98
+ #elif defined (ADRENO_GPU)
99
+ #define N_DST 8
100
+ #define N_SIMDGROUP 1
101
+ #define N_SIMDWIDTH 64
102
+ #endif
103
+
104
+ inline void mul_vec_q_n_f32_8x_flat(
105
+ global uchar * src0_q,
106
+ global half * src0_d,
107
+ global float * src1,
108
+ global float * dst,
109
+ int ne00,
110
+ int ne01,
111
+ int ne02,
112
+ int ne10,
113
+ int ne12,
114
+ int ne0,
115
+ int ne1,
116
+ int r2,
117
+ int r3
118
+ ) {
119
+ const ulong nb = ne00/QK4_0;
120
+
121
+ int r0 = get_group_id(0);
122
+ int r1 = get_group_id(1);
123
+ int im = get_group_id(2);
124
+
125
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
126
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
127
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
128
+ // Currently with llama2 7B, im is always 0.
129
+ // TODO: how to handle im/gqa*(nb*ne0)?
130
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
131
+
132
+ int i12 = im%ne12;
133
+ int i13 = im/ne12;
134
+
135
+ // The number of scales is the same as the number of blocks.
136
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
137
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
138
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
139
+
140
+ global uchar * x = (global uchar *) src0_q + offset0_q;
141
+ global half * d = (global half *) src0_d + offset0_d;
142
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
143
+
144
+ float16 yl;
145
+ float8 sumf = 0.f;
146
+
147
+ int ix = get_sub_group_local_id()/2;
148
+ int il = 8*(get_sub_group_local_id()%2);
149
+
150
+ global float * yb = y + ix*QK4_0 + il;
151
+
152
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
153
+ float sumy = 0.f;
154
+
155
+ sumy += yb[0];
156
+ sumy += yb[1];
157
+ sumy += yb[2];
158
+ sumy += yb[3];
159
+ sumy += yb[4];
160
+ sumy += yb[5];
161
+ sumy += yb[6];
162
+ sumy += yb[7];
163
+
164
+ sumy += yb[16];
165
+ sumy += yb[17];
166
+ sumy += yb[18];
167
+ sumy += yb[19];
168
+ sumy += yb[20];
169
+ sumy += yb[21];
170
+ sumy += yb[22];
171
+ sumy += yb[23];
172
+
173
+ yl.s0 = yb[0];
174
+ yl.s1 = yb[1]/256.f;
175
+
176
+ yl.s2 = yb[2];
177
+ yl.s3 = yb[3]/256.f;
178
+
179
+ yl.s4 = yb[4];
180
+ yl.s5 = yb[5]/256.f;
181
+
182
+ yl.s6 = yb[6];
183
+ yl.s7 = yb[7]/256.f;
184
+
185
+ yl.s8 = yb[16]/16.f;
186
+ yl.s9 = yb[17]/4096.f;
187
+
188
+ yl.sa = yb[18]/16.f;
189
+ yl.sb = yb[19]/4096.f;
190
+
191
+ yl.sc = yb[20]/16.f;
192
+ yl.sd = yb[21]/4096.f;
193
+
194
+ yl.se = yb[22]/16.f;
195
+ yl.sf = yb[23]/4096.f;
196
+
197
+ sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
198
+ sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
199
+ sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
200
+ sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
201
+
202
+ sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
203
+ sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
204
+ sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
205
+ sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
206
+
207
+ yb += QK4_0 * (N_SIMDWIDTH/2);
208
+ }
209
+
210
+ float8 tot = (float8)(
211
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
212
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
213
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
214
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
215
+ );
216
+
217
+ if (get_sub_group_local_id() == 0) {
218
+ if (first_row + 0 < ne01) {
219
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
220
+ }
221
+ if (first_row + 1 < ne01) {
222
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
223
+ }
224
+ if (first_row + 2 < ne01) {
225
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
226
+ }
227
+ if (first_row + 3 < ne01) {
228
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
229
+ }
230
+
231
+ if (first_row + 4 < ne01) {
232
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
233
+ }
234
+ if (first_row + 5 < ne01) {
235
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
236
+ }
237
+ if (first_row + 6 < ne01) {
238
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
239
+ }
240
+ if (first_row + 7 < ne01) {
241
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
242
+ }
243
+ }
244
+ }
245
+
246
+ #ifdef INTEL_GPU
247
+ REQD_SUBGROUP_SIZE_16
248
+ #elif defined (ADRENO_GPU)
249
+ REQD_SUBGROUP_SIZE_64
250
+ #endif
251
+ kernel void kernel_mul_mat_q4_0_f32_8x_flat(
252
+ global uchar * src0_q,
253
+ global half * src0_d,
254
+ global float * src1,
255
+ ulong offset1,
256
+ global float * dst,
257
+ ulong offsetd,
258
+ int ne00,
259
+ int ne01,
260
+ int ne02,
261
+ int ne10,
262
+ int ne12,
263
+ int ne0,
264
+ int ne1,
265
+ int r2,
266
+ int r3
267
+ ) {
268
+ src1 = (global float*)((global char*)src1 + offset1);
269
+ dst = (global float*)((global char*)dst + offsetd);
270
+
271
+ mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
272
+ }
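The *_flat kernels do not read struct block_q4_0 directly: they take the scales and the packed nibbles as two separate buffers, indexed as d + ib + row*nb and x + ib*QK4_0/2 + row*nb*QK4_0/2. A rough host-side C sketch of that split layout (illustration only; in the PR the conversion is done on-device, see kernels/cvt.cl, and the type and function names below are just stand-ins):

#include <stdint.h>
#include <string.h>

#define QK4_0 32

typedef uint16_t half_bits;            // stand-in for the 16-bit half scale

struct block_q4_0 {                    // mirrors the struct used by the kernels
    half_bits d;
    uint8_t   qs[QK4_0 / 2];
};

// blocks: nrows*nb interleaved blocks; d_out: nrows*nb scales; qs_out: nrows*nb*(QK4_0/2) bytes
void flatten_q4_0(const struct block_q4_0 *blocks, int nrows, int nb,
                  half_bits *d_out, uint8_t *qs_out) {
    for (int row = 0; row < nrows; ++row) {
        for (int ib = 0; ib < nb; ++ib) {
            const struct block_q4_0 *b = &blocks[row * nb + ib];
            d_out[row * nb + ib] = b->d;
            memcpy(qs_out + (size_t)(row * nb + ib) * (QK4_0 / 2), b->qs, QK4_0 / 2);
        }
    }
}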
ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl ADDED
@@ -0,0 +1,254 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ //
51
+ // This variant unrolls the loops and uses vector types instead of pointers.
52
+ // It improves performance on Adreno but not so much on Intel.
53
+ //
54
+ inline float block_q_4_0_dot_y_v(
55
+ global struct block_q4_0 * qb_curr,
56
+ float sumy,
57
+ float16 yl,
58
+ int il
59
+ ) {
60
+ float d = qb_curr->d;
61
+ float acc = 0.f;
62
+ global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
63
+
64
+ acc += yl.s0 * (qs[0] & 0x000F);
65
+ acc += yl.s1 * (qs[0] & 0x0F00);
66
+ acc += yl.s8 * (qs[0] & 0x00F0);
67
+ acc += yl.s9 * (qs[0] & 0xF000);
68
+
69
+ acc += yl.s2 * (qs[1] & 0x000F);
70
+ acc += yl.s3 * (qs[1] & 0x0F00);
71
+ acc += yl.sa * (qs[1] & 0x00F0);
72
+ acc += yl.sb * (qs[1] & 0xF000);
73
+
74
+ acc += yl.s4 * (qs[2] & 0x000F);
75
+ acc += yl.s5 * (qs[2] & 0x0F00);
76
+ acc += yl.sc * (qs[2] & 0x00F0);
77
+ acc += yl.sd * (qs[2] & 0xF000);
78
+
79
+ acc += yl.s6 * (qs[3] & 0x000F);
80
+ acc += yl.s7 * (qs[3] & 0x0F00);
81
+ acc += yl.se * (qs[3] & 0x00F0);
82
+ acc += yl.sf * (qs[3] & 0xF000);
83
+
84
+ return d * (sumy * -8.f + acc);
85
+ }
86
+
87
+ #undef N_DST
88
+ #undef N_SIMDGROUP
89
+ #undef N_SIMDWIDTH
90
+
91
+ #ifdef INTEL_GPU
92
+ #define N_DST 4 // each SIMD group works on 4 rows
93
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
94
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
95
+ #elif defined (ADRENO_GPU)
96
+ #define N_DST 4
97
+ #define N_SIMDGROUP 1
98
+ #define N_SIMDWIDTH 64
99
+ #endif
100
+
101
+ inline void mul_vec_q_n_f32_v(
102
+ global void * src0,
103
+ global float * src1,
104
+ global float * dst,
105
+ int ne00,
106
+ int ne01,
107
+ int ne02,
108
+ int ne10,
109
+ int ne12,
110
+ int ne0,
111
+ int ne1,
112
+ int r2,
113
+ int r3
114
+ ) {
115
+ const ulong nb = ne00/QK4_0;
116
+
117
+ int r0 = get_group_id(0);
118
+ int r1 = get_group_id(1);
119
+ int im = get_group_id(2);
120
+
121
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
122
+ // id of a SIMD group in the grid.
123
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
124
+
125
+ int i12 = im%ne12;
126
+ int i13 = im/ne12;
127
+
128
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
129
+
130
+ global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
131
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
132
+
133
+ float16 yl; // src1 vector cache
134
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
135
+
136
+ int ix = get_sub_group_local_id()/2;
137
+ int il = 8*(get_sub_group_local_id()%2);
138
+
139
+ global float * yb = y + ix * QK4_0 + il;
140
+
141
+ // each thread in a SIMD group deals with half a block.
142
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
143
+ float sumy = 0;
144
+
145
+ sumy += yb[0];
146
+ sumy += yb[1];
147
+ sumy += yb[2];
148
+ sumy += yb[3];
149
+ sumy += yb[4];
150
+ sumy += yb[5];
151
+ sumy += yb[6];
152
+ sumy += yb[7];
153
+
154
+ sumy += yb[16];
155
+ sumy += yb[17];
156
+ sumy += yb[18];
157
+ sumy += yb[19];
158
+ sumy += yb[20];
159
+ sumy += yb[21];
160
+ sumy += yb[22];
161
+ sumy += yb[23];
162
+
163
+
164
+ yl.s0 = yb[0];
165
+ yl.s1 = yb[1]/256.f;
166
+
167
+ yl.s2 = yb[2];
168
+ yl.s3 = yb[3]/256.f;
169
+
170
+ yl.s4 = yb[4];
171
+ yl.s5 = yb[5]/256.f;
172
+
173
+ yl.s6 = yb[6];
174
+ yl.s7 = yb[7]/256.f;
175
+
176
+ yl.s8 = yb[16]/16.f;
177
+ yl.s9 = yb[17]/4096.f;
178
+
179
+ yl.sa = yb[18]/16.f;
180
+ yl.sb = yb[19]/4096.f;
181
+
182
+ yl.sc = yb[20]/16.f;
183
+ yl.sd = yb[21]/4096.f;
184
+
185
+ yl.se = yb[22]/16.f;
186
+ yl.sf = yb[23]/4096.f;
187
+
188
+ sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
189
+ sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
190
+ sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
191
+ sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
192
+
193
+ // One thread in a SIMD group (i.e., subgroup) handles a half block,
194
+ // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
195
+ // y points to the activation matrix (of type float). Therefore for
196
+ // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
197
+ // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
198
+ // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
199
+ yb += QK4_0 * (N_SIMDWIDTH/2);
200
+ }
201
+
202
+ // The above does not work for Adreno - it produces incorrect results for
203
+ // row = 1, 2, 3 and only row = 0 gives the correct result.
204
+ // If N_DST is changed, the below array must be initialized accordingly.
205
+ // This also seems to perform better on Intel.
206
+ float4 tot = (float4)(
207
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
208
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
209
+ );
210
+
211
+ if (get_sub_group_local_id() == 0) {
212
+ if (first_row + 0 < ne01) {
213
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
214
+ }
215
+ if (first_row + 1 < ne01) {
216
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
217
+ }
218
+ if (first_row + 2 < ne01) {
219
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
220
+ }
221
+ if (first_row + 3 < ne01) {
222
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
223
+ }
224
+ }
225
+ }
226
+
227
+ #ifdef INTEL_GPU
228
+ REQD_SUBGROUP_SIZE_16
229
+ #elif defined (ADRENO_GPU)
230
+ REQD_SUBGROUP_SIZE_64
231
+ #endif
232
+ kernel void kernel_mul_mat_q4_0_f32_v(
233
+ global void * src0,
234
+ ulong offset0,
235
+ global float * src1,
236
+ ulong offset1,
237
+ global float * dst,
238
+ ulong offsetd,
239
+ int ne00,
240
+ int ne01,
241
+ int ne02,
242
+ int ne10,
243
+ int ne12,
244
+ int ne0,
245
+ int ne1,
246
+ int r2,
247
+ int r3
248
+ ) {
249
+ src0 = (global void*)((global char*)src0 + offset0);
250
+ src1 = (global float*)((global char*)src1 + offset1);
251
+ dst = (global float*)((global char*)dst + offsetd);
252
+
253
+ mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
254
+ }
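For all of these mul_mv kernels one work-group holds N_SIMDGROUP subgroups of N_SIMDWIDTH threads and each subgroup produces N_DST output rows, so the host needs ceil(ne01 / (N_DST * N_SIMDGROUP)) groups along dimension 0, one group per src1 column along dimension 1 and one per batch slice along dimension 2. A hedged C sketch of that sizing for kernel_mul_mat_q4_0_f32_v with the Adreno constants (N_DST=4, N_SIMDGROUP=1, N_SIMDWIDTH=64); this is not the actual dispatch code in ggml-opencl.cpp, and ne11/ne13 are assumed names for the src1 column count and outer batch:

#include <stddef.h>

static size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }

void mul_mv_q4_0_v_ndrange(int ne01, int ne11, int ne12, int ne13,
                           size_t global[3], size_t local[3]) {
    const size_t n_dst = 4, n_simdgroup = 1, n_simdwidth = 64;
    local[0]  = n_simdgroup * n_simdwidth;       // one work-group = N_SIMDGROUP subgroups
    local[1]  = 1;
    local[2]  = 1;
    global[0] = ceil_div((size_t)ne01, n_dst * n_simdgroup) * local[0];
    global[1] = (size_t)ne11;                    // one group per src1 column (r1)
    global[2] = (size_t)(ne12 * ne13);           // one group per batch slice (im)
}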
ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl ADDED
@@ -0,0 +1,190 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q6_K
43
+ //------------------------------------------------------------------------------
44
+ // 6-bit quantization
45
+ // weight is represented as x = a * q
46
+ // 16 blocks of 16 elements each
47
+ // Effectively 6.5625 bits per weight
48
+ typedef struct {
49
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
50
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
51
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
52
+ half d; // super-block scale
53
+ } block_q6_K;
54
+
55
+ //------------------------------------------------------------------------------
56
+ // kernel_mul_mv_q6_K_f32
57
+ //------------------------------------------------------------------------------
58
+
59
+ #undef N_DST
60
+ #undef N_SIMDGROUP
61
+ #undef N_SIMDWIDTH
62
+
63
+ #ifdef INTEL_GPU
64
+ #define N_DST 1 // number of rows each SIMD group works on
65
+ #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
66
+ #define N_SIMDWIDTH 16 // SIMD group size
67
+ #elif defined (ADRENO_GPU)
68
+ #define N_DST 1
69
+ #define N_SIMDGROUP 2
70
+ #define N_SIMDWIDTH 64
71
+ #endif
72
+
73
+ #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
74
+
75
+ #ifdef INTEL_GPU
76
+ REQD_SUBGROUP_SIZE_16
77
+ #elif defined (ADRENO_GPU)
78
+ REQD_SUBGROUP_SIZE_64
79
+ #endif
80
+ kernel void kernel_mul_mv_q6_K_f32(
81
+ global void * src0,
82
+ ulong offset0,
83
+ global float * src1,
84
+ ulong offset1,
85
+ global float * dst,
86
+ ulong offsetd,
87
+ int ne00,
88
+ int ne01,
89
+ int ne02,
90
+ int ne10,
91
+ int ne12,
92
+ int ne0,
93
+ int ne1,
94
+ int r2,
95
+ int r3
96
+ ) {
97
+ src0 = (global void*)((global char*)src0 + offset0);
98
+ src1 = (global float*)((global char*)src1 + offset1);
99
+ dst = (global float*)((global char*)dst + offsetd);
100
+
101
+ uchar kmask1 = 0x03;
102
+ uchar kmask2 = 0x0C;
103
+ uchar kmask3 = 0x30;
104
+ uchar kmask4 = 0xC0;
105
+
106
+ int nb = ne00/QK_K;
107
+
108
+ int r0 = get_group_id(0);
109
+ int r1 = get_group_id(1);
110
+ int im = get_group_id(2);
111
+
112
+ int row = N_SIMDGROUP * r0 + get_sub_group_id();
113
+
114
+ int i12 = im%ne12;
115
+ int i13 = im/ne12;
116
+
117
+ ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
118
+
119
+ global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
120
+ global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
121
+
122
+ float sumf = 0;
123
+
124
+ // For Q6_K quantization, 16 values forms a subblock, 16 subblock forms a
125
+ // block. Values in a subblock shares a scale that is quantized with 8 bits;
126
+ // the entire block shares a single floating point scale.
127
+ // For work distribution, each thread processes a subblock (16 weights), hence
128
+ // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
129
+ // (super) blocks -- this is the block stride.
130
+ // The 16 threads that process a (super) block are split into 2 portions, each has
131
+ // 8 threads; each portion works on 8 subblocks.
132
+ // For subgroup of 16 threads, the entire subgroup works on a single (super) block
133
+ // before moving to the next (super) block. Thread0 - thread7 work on the
134
+ // first 8 subblocks; thread8 - thread15 works on the last 8 subblocks.
135
+ // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
136
+ // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
137
+ // works on a total of 16 weight values.
138
+ int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
139
+ int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
140
+ int ip = tid/8; // first or second half of (super) block (0 or 1)
141
+ int il = tid%8; // each half has 8 parts, one per scale
142
+ int n = 4; // 4 scales at a time (and 4 sums)
143
+ int l0 = n*il; // offset into half-block, 0..28
144
+ int is = 8*ip + l0/16; // 0, 1, 8, 9
145
+
146
+ int y_offset = 128*ip + l0;
147
+ int q_offset_l = 64*ip + l0;
148
+ int q_offset_h = 32*ip + l0;
149
+
150
+ for (int i = ix; i < nb; i += BLOCK_STRIDE) {
151
+
152
+ global uint8_t * q1 = x[i].ql + q_offset_l;
153
+ global uint8_t * q2 = q1 + QK_K/8;
154
+ global uint8_t * qh = x[i].qh + q_offset_h;
155
+ global int8_t * sc = x[i].scales + is;
156
+
157
+ global float * y = yy + i * QK_K + y_offset;
158
+
159
+ float dall = x[i].d;
160
+
161
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
162
+
163
+ sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
164
+ sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
165
+ sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
166
+ sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
167
+
168
+ sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
169
+ sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
170
+ sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
171
+ sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
172
+
173
+ sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
174
+ sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
175
+ sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
176
+ sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
177
+
178
+ sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
179
+ sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
180
+ sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
181
+ sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
182
+
183
+ sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
184
+ }
185
+
186
+ float tot = sub_group_reduce_add(sumf);
187
+ if (get_sub_group_local_id() == 0) {
188
+ dst[r1*ne0 + im*ne0*ne1 + row] = tot;
189
+ }
190
+ }
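kernel_mul_mv_q6_K_f32 above decodes the Q6_K layout on the fly: each 6-bit quant is a low nibble from ql combined with two bits from qh, offset by 32, then scaled by the per-16-value int8 scale and the super-block scale d. A scalar C sketch of that decode for the first 128 values of a super-block, transcribing the same masks the kernel uses (illustration only; the float d stands in for the half field of the struct):

#include <stdint.h>

#define QK_K 256

void dequant_q6_k_first_half(const uint8_t ql[QK_K/2], const uint8_t qh[QK_K/4],
                             const int8_t scales[QK_K/16], float d, float out[128]) {
    for (int l = 0; l < 32; ++l) {
        const int is = l / 16;                                    // which 16-value scale group
        const int q0 = (ql[l +  0] & 0xF) | ((qh[l] & 0x03) << 4);
        const int q1 = (ql[l + 32] & 0xF) | ((qh[l] & 0x0C) << 2);
        const int q2 = (ql[l +  0] >>  4) | ((qh[l] & 0x30) << 0);
        const int q3 = (ql[l + 32] >>  4) | ((qh[l] & 0xC0) >> 2);
        out[l +  0] = d * scales[is + 0] * (float)(q0 - 32);
        out[l + 32] = d * scales[is + 2] * (float)(q1 - 32);
        out[l + 64] = d * scales[is + 4] * (float)(q2 - 32);
        out[l + 96] = d * scales[is + 6] * (float)(q3 - 32);
    }
}

The second half of the super-block (ip = 1 in the kernel) follows the same pattern with ql, qh and the scales advanced by 64, 32 and 8 respectively.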
ggml/src/ggml-opencl/kernels/norm.cl ADDED
@@ -0,0 +1,81 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_required_subgroup_size
4
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
5
+ #define INTEL_GPU 1
6
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
7
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
8
+ #elif defined(cl_qcom_reqd_sub_group_size)
9
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
10
+ #define ADRENO_GPU 1
11
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
12
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
13
+ #endif
14
+
15
+ //------------------------------------------------------------------------------
16
+ // norm
17
+ //------------------------------------------------------------------------------
18
+ kernel void kernel_norm(
19
+ global void * src0,
20
+ ulong offset0,
21
+ global float * dst,
22
+ ulong offsetd,
23
+ int ne00,
24
+ int ne01,
25
+ int ne02,
26
+ int ne03,
27
+ ulong nb01,
28
+ ulong nb02,
29
+ ulong nb03,
30
+ float eps,
31
+ local float * sum
32
+ ) {
33
+ src0 = (global void*)((global char*)src0 + offset0);
34
+ dst = (global float*)((global char*)dst + offsetd);
35
+
36
+ int i03 = get_group_id(2);
37
+ int i02 = get_group_id(1);
38
+ int i01 = get_group_id(0);
39
+
40
+ global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
41
+
42
+ // MEAN
43
+ // parallel sum
44
+ sum[get_local_id(0)] = 0.0f;
45
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
46
+ sum[get_local_id(0)] += x[i00];
47
+ }
48
+ // reduce
49
+ barrier(CLK_LOCAL_MEM_FENCE);
50
+ for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
51
+ if (get_local_id(0) < i) {
52
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
53
+ }
54
+ barrier(CLK_LOCAL_MEM_FENCE);
55
+ }
56
+ float mean = sum[0] / ne00;
57
+
58
+ // recenter and VARIANCE
59
+ barrier(CLK_LOCAL_MEM_FENCE);
60
+ global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
61
+ sum[get_local_id(0)] = 0.0f;
62
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
63
+ y[i00] = x[i00] - mean;
64
+ sum[get_local_id(0)] += y[i00] * y[i00];
65
+ }
66
+
67
+ // reduce
68
+ barrier(CLK_LOCAL_MEM_FENCE);
69
+ for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
70
+ if (get_local_id(0) < i) {
71
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
72
+ }
73
+ barrier(CLK_LOCAL_MEM_FENCE);
74
+ }
75
+ float variance = sum[0] / ne00;
76
+
77
+ float scale = 1.0f/sqrt(variance + eps);
78
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
79
+ y[i00] = y[i00] * scale;
80
+ }
81
+ }
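kernel_norm is a work-group-parallel version of a plain layer normalization: one pass to get the mean, one pass to recenter and accumulate the variance, then a rescale by 1/sqrt(variance + eps), with both sums done as local-memory tree reductions. A scalar C reference of what each row ends up as (for reading the kernel, not part of the PR):

#include <math.h>

void norm_row_ref(const float *x, float *y, int n, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < n; ++i) mean += x[i];
    mean /= n;

    float variance = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;          // recenter
        variance += y[i] * y[i];
    }
    variance /= n;

    const float scale = 1.0f / sqrtf(variance + eps);
    for (int i = 0; i < n; ++i) y[i] *= scale;
}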
ggml/src/ggml-opencl/kernels/relu.cl ADDED
@@ -0,0 +1,16 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // relu
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_relu(
7
+ global float * src0,
8
+ ulong offset0,
9
+ global float * dst,
10
+ ulong offsetd
11
+ ) {
12
+ src0 = (global float*)((global char*)src0 + offset0);
13
+ dst = (global float*)((global char*)dst + offsetd);
14
+
15
+ dst[get_global_id(0)] = fmax(0.0f, src0[get_global_id(0)]);
16
+ }
ggml/src/ggml-opencl/kernels/rms_norm.cl ADDED
@@ -0,0 +1,96 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ //------------------------------------------------------------------------------
22
+ // rms_norm
23
+ //------------------------------------------------------------------------------
24
+ // This kernel depends on subgroup size.
25
+ #ifdef INTEL_GPU
26
+ REQD_SUBGROUP_SIZE_32
27
+ #elif defined (ADRENO_GPU)
28
+ REQD_SUBGROUP_SIZE_64
29
+ #endif
30
+ kernel void kernel_rms_norm(
31
+ global void * src0,
32
+ ulong offset0,
33
+ global float * dst,
34
+ ulong offsetd,
35
+ int ne00,
36
+ int ne01,
37
+ int ne02,
38
+ int ne03,
39
+ ulong nb01,
40
+ ulong nb02,
41
+ ulong nb03,
42
+ float eps,
43
+ local float * sum // Note, the size depends on number of subgroups
44
+ ) {
45
+ src0 = (global void*)((global char*)src0 + offset0);
46
+ dst = (global float*)((global char*)dst + offsetd);
47
+
48
+ int i03 = get_group_id(2);
49
+ int i02 = get_group_id(1);
50
+ int i01 = get_group_id(0);
51
+
52
+ global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
53
+ global float * x_scalar = (global float *) x;
54
+ float4 sumf = 0;
55
+ float all_sum = 0;
56
+
57
+ // parallel sum
58
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
59
+ sumf += x[i00] * x[i00];
60
+ }
61
+ all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
62
+ all_sum = sub_group_reduce_add(all_sum);
63
+ if (get_sub_group_local_id() == 0) {
64
+ sum[get_sub_group_id()] = all_sum;
65
+ }
66
+
67
+ barrier(CLK_LOCAL_MEM_FENCE);
68
+ // broadcast
69
+ for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
70
+ if (get_local_id(0) < i) {
71
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
72
+ }
73
+ }
74
+ if (get_local_id(0) == 0) {
75
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {
76
+ sum[0] += x_scalar[i];
77
+ }
78
+ sum[0] /= ne00;
79
+ }
80
+
81
+ barrier(CLK_LOCAL_MEM_FENCE);
82
+
83
+ const float mean = sum[0];
84
+ const float scale = 1.0f/sqrt(mean + eps);
85
+
86
+ global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
87
+ global float * y_scalar = (global float *) y;
88
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
89
+ y[i00] = x[i00] * scale;
90
+ }
91
+ if (get_local_id(0) == 0) {
92
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
93
+ y_scalar[i00] = x_scalar[i00] * scale;
94
+ }
95
+ }
96
+ }
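kernel_rms_norm differs from kernel_norm in that it divides by the root of the mean of squares and never subtracts a mean; the kernel accumulates the sum of squares with float4 loads plus a subgroup reduction and handles the ne00 % 4 tail through the scalar views x_scalar/y_scalar. A scalar C reference of the per-row result (illustration only, not part of the PR):

#include <math.h>

void rms_norm_row_ref(const float *x, float *y, int n, float eps) {
    float sum_sq = 0.0f;
    for (int i = 0; i < n; ++i) sum_sq += x[i] * x[i];
    const float mean  = sum_sq / n;
    const float scale = 1.0f / sqrtf(mean + eps);
    for (int i = 0; i < n; ++i) y[i] = x[i] * scale;
}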
ggml/src/ggml-opencl/kernels/rope.cl ADDED
@@ -0,0 +1,721 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // kernel_rope
5
+ //------------------------------------------------------------------------------
6
+ float rope_yarn_ramp(float low, float high, int i0) {
7
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
8
+ return 1.0f - min(1.0f, max(0.0f, y));
9
+ }
10
+
11
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
12
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
13
+ float2 rope_yarn(
14
+ float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
15
+ ) {
16
+ // Get n-d rotational scaling corrected for extrapolation
17
+ float theta_interp = freq_scale * theta_extrap;
18
+ float theta = theta_interp;
19
+ if (ext_factor != 0.0f) {
20
+ float ramp_mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
21
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
22
+
23
+ // Get n-d magnitude scaling corrected for interpolation
24
+ mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
25
+ }
26
+ return (float2)(cos(theta) * mscale, sin(theta) * mscale);
27
+ }
28
+
29
+ // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
30
+ // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
31
+ float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
32
+ return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
33
+ }
34
+
35
+ float2 rope_yarn_corr_dims(
36
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
37
+ ) {
38
+ // start and end correction dims
39
+ return (float2)(
40
+ max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))),
41
+ min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)))
42
+ );
43
+ }
44
+
45
+ kernel void kernel_rope_norm_f32(
46
+ global void * src0,
47
+ ulong offset0,
48
+ global int * src1,
49
+ ulong offset1,
50
+ global float * src2,
51
+ ulong offset2,
52
+ global float * dst,
53
+ ulong offsetd,
54
+ int ne00,
55
+ int ne01,
56
+ int ne02,
57
+ int ne03,
58
+ ulong nb00,
59
+ ulong nb01,
60
+ ulong nb02,
61
+ ulong nb03,
62
+ int ne0,
63
+ int ne1,
64
+ int ne2,
65
+ int ne3,
66
+ ulong nb0,
67
+ ulong nb1,
68
+ ulong nb2,
69
+ ulong nb3,
70
+ int n_past,
71
+ int n_dims,
72
+ int n_ctx_orig,
73
+ float freq_base,
74
+ float freq_scale,
75
+ float ext_factor,
76
+ float attn_factor,
77
+ float beta_fast,
78
+ float beta_slow
79
+ ) {
80
+ src0 = (global void*)((global char*)src0 + offset0);
81
+ src1 = (global int*)((global char*)src1 + offset1);
82
+ src2 = (global float*)((global char*)src2 + offset2);
83
+ dst = (global float*)((global char*)dst + offsetd);
84
+
85
+ int i3 = get_group_id(2);
86
+ int i2 = get_group_id(1);
87
+ int i1 = get_group_id(0);
88
+
89
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
90
+
91
+ global int * pos = src1;
92
+
93
+ float theta_base = (float) pos[i2];
94
+ float inv_ndims = -1.f/n_dims;
95
+
96
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
97
+ if (i0 < n_dims) {
98
+ int ic = i0/2;
99
+
100
+ float theta = theta_base * pow(freq_base, inv_ndims*i0);
101
+
102
+ float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
103
+
104
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
105
+
106
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
107
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
108
+
109
+ float x0 = src[0];
110
+ float x1 = src[1];
111
+
112
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
113
+ dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
114
+ } else {
115
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
116
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
117
+
118
+ dst_data[0] = src[0];
119
+ dst_data[1] = src[1];
120
+ }
121
+ }
122
+ }
123
+
124
+ kernel void kernel_rope_norm_f16(
125
+ global void * src0,
126
+ ulong offset0,
127
+ global int * src1,
128
+ ulong offset1,
129
+ global float * src2,
130
+ ulong offset2,
131
+ global float * dst,
132
+ ulong offsetd,
133
+ int ne00,
134
+ int ne01,
135
+ int ne02,
136
+ int ne03,
137
+ ulong nb00,
138
+ ulong nb01,
139
+ ulong nb02,
140
+ ulong nb03,
141
+ int ne0,
142
+ int ne1,
143
+ int ne2,
144
+ int ne3,
145
+ ulong nb0,
146
+ ulong nb1,
147
+ ulong nb2,
148
+ ulong nb3,
149
+ int n_past,
150
+ int n_dims,
151
+ int n_ctx_orig,
152
+ float freq_base,
153
+ float freq_scale,
154
+ float ext_factor,
155
+ float attn_factor,
156
+ float beta_fast,
157
+ float beta_slow
158
+ ) {
159
+ src0 = (global void*)((global char*)src0 + offset0);
160
+ src1 = (global int*)((global char*)src1 + offset1);
161
+ src2 = (global float*)((global char*)src2 + offset2);
162
+ dst = (global float*)((global char*)dst + offsetd);
163
+
164
+ int i3 = get_group_id(2);
165
+ int i2 = get_group_id(1);
166
+ int i1 = get_group_id(0);
167
+
168
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
169
+
170
+ global int * pos = src1;
171
+
172
+ float theta_base = (float) pos[i2];
173
+ float inv_ndims = -1.f/n_dims;
174
+
175
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
176
+ if (i0 < n_dims) {
177
+ int ic = i0/2;
178
+
179
+ float theta = theta_base * pow(freq_base, inv_ndims*i0);
180
+
181
+ float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
182
+
183
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
184
+
185
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
186
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
187
+
188
+ float x0 = src[0];
189
+ float x1 = src[1];
190
+
191
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
192
+ dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
193
+ } else {
194
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
195
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
196
+
197
+ dst_data[0] = src[0];
198
+ dst_data[1] = src[1];
199
+ }
200
+ }
201
+ }
202
+
203
+ kernel void kernel_rope_neox_f32(
204
+ global void * src0,
205
+ ulong offset0,
206
+ global int * src1,
207
+ ulong offset1,
208
+ global float * src2,
209
+ ulong offset2,
210
+ global float * dst,
211
+ ulong offsetd,
212
+ int ne00,
213
+ int ne01,
214
+ int ne02,
215
+ int ne03,
216
+ ulong nb00,
217
+ ulong nb01,
218
+ ulong nb02,
219
+ ulong nb03,
220
+ int ne0,
221
+ int ne1,
222
+ int ne2,
223
+ int ne3,
224
+ ulong nb0,
225
+ ulong nb1,
226
+ ulong nb2,
227
+ ulong nb3,
228
+ int n_past,
229
+ int n_dims,
230
+ int n_ctx_orig,
231
+ float freq_base,
232
+ float freq_scale,
233
+ float ext_factor,
234
+ float attn_factor,
235
+ float beta_fast,
236
+ float beta_slow
237
+ ) {
238
+ src0 = (global void*)((global char*)src0 + offset0);
239
+ src1 = (global int*)((global char*)src1 + offset1);
240
+ src2 = (global float*)((global char*)src2 + offset2);
241
+ dst = (global float*)((global char*)dst + offsetd);
242
+
243
+ int i3 = get_group_id(2);
244
+ int i2 = get_group_id(1);
245
+ int i1 = get_group_id(0);
246
+
247
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
248
+
249
+ global int * pos = src1;
250
+
251
+ float theta_base = (float) pos[i2];
252
+ float inv_ndims = -1.f/n_dims;
253
+
254
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
255
+ if (i0 < n_dims) {
256
+ int ic = i0/2;
257
+
258
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
259
+
260
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
261
+
262
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
263
+
264
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
265
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
266
+
267
+ const float x0 = src[0];
268
+ const float x1 = src[n_dims/2];
269
+
270
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
271
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
272
+ } else {
273
+ global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
274
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
275
+
276
+ dst_data[0] = src[0];
277
+ dst_data[1] = src[1];
278
+ }
279
+ }
280
+ }
281
+
282
+ kernel void kernel_rope_neox_f16(
283
+ global void * src0,
284
+ ulong offset0,
285
+ global int * src1,
286
+ ulong offset1,
287
+ global float * src2,
288
+ ulong offset2,
289
+ global float * dst,
290
+ ulong offsetd,
291
+ int ne00,
292
+ int ne01,
293
+ int ne02,
294
+ int ne03,
295
+ ulong nb00,
296
+ ulong nb01,
297
+ ulong nb02,
298
+ ulong nb03,
299
+ int ne0,
300
+ int ne1,
301
+ int ne2,
302
+ int ne3,
303
+ ulong nb0,
304
+ ulong nb1,
305
+ ulong nb2,
306
+ ulong nb3,
307
+ int n_past,
308
+ int n_dims,
309
+ int n_ctx_orig,
310
+ float freq_base,
311
+ float freq_scale,
312
+ float ext_factor,
313
+ float attn_factor,
314
+ float beta_fast,
315
+ float beta_slow
316
+ ) {
317
+ src0 = (global void*)((global char*)src0 + offset0);
318
+ src1 = (global int*)((global char*)src1 + offset1);
319
+ src2 = (global float*)((global char*)src2 + offset2);
320
+ dst = (global float*)((global char*)dst + offsetd);
321
+
322
+ int i3 = get_group_id(2);
323
+ int i2 = get_group_id(1);
324
+ int i1 = get_group_id(0);
325
+
326
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
327
+
328
+ global int * pos = src1;
329
+
330
+ float theta_base = (float) pos[i2];
331
+ float inv_ndims = -1.f/n_dims;
332
+
333
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
334
+ if (i0 < n_dims) {
335
+ int ic = i0/2;
336
+
337
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
338
+
339
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
340
+
341
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
342
+
343
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
344
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
345
+
346
+ const float x0 = src[0];
347
+ const float x1 = src[n_dims/2];
348
+
349
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
350
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
351
+ } else {
352
+ global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
353
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
354
+
355
+ dst_data[0] = src[0];
356
+ dst_data[1] = src[1];
357
+ }
358
+ }
359
+ }
360
+
361
+ kernel void kernel_rope_multi_f32(
362
+ global void * src0,
363
+ ulong offset0,
364
+ global int * src1,
365
+ ulong offset1,
366
+ global float * src2,
367
+ ulong offset2,
368
+ global float * dst,
369
+ ulong offsetd,
370
+ int ne00,
371
+ int ne01,
372
+ int ne02,
373
+ int ne03,
374
+ ulong nb00,
375
+ ulong nb01,
376
+ ulong nb02,
377
+ ulong nb03,
378
+ int ne0,
379
+ int ne1,
380
+ int ne2,
381
+ int ne3,
382
+ ulong nb0,
383
+ ulong nb1,
384
+ ulong nb2,
385
+ ulong nb3,
386
+ int n_past,
387
+ int n_dims,
388
+ int n_ctx_orig,
389
+ float freq_base,
390
+ float freq_scale,
391
+ float ext_factor,
392
+ float attn_factor,
393
+ float beta_fast,
394
+ float beta_slow,
395
+ int4 sections
396
+ ) {
397
+ src0 = (global void*)((global char*)src0 + offset0);
398
+ src1 = (global int*)((global char*)src1 + offset1);
399
+ src2 = (global float*)((global char*)src2 + offset2);
400
+ dst = (global float*)((global char*)dst + offsetd);
401
+
402
+ int i3 = get_group_id(2);
403
+ int i2 = get_group_id(1);
404
+ int i1 = get_group_id(0);
405
+
406
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
407
+
408
+ global int * pos = src1;
409
+
410
+ const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
411
+ const int sec_w = sections.s1 + sections.s0;
412
+
413
+ float inv_ndims = -1.f/n_dims;
414
+
415
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
416
+ if (i0 < n_dims) {
417
+ int ic = i0/2;
418
+
419
+ const int sector = (i0 / 2) % sect_dims;
420
+ float theta_base = 0.0f;
421
+
422
+ if (sector < sections.s0) {
423
+ theta_base = pos[i2];
424
+ }
425
+ else if (sector >= sections.s0 && sector < sec_w) {
426
+ theta_base = pos[i2 + ne2 * 1];
427
+ }
428
+ else if (sector >= sec_w && sector < sec_w + sections.s2) {
429
+ theta_base = pos[i2 + ne2 * 2];
430
+ }
431
+ else if (sector >= sec_w + sections.s2) {
432
+ theta_base = pos[i2 + ne2 * 3];
433
+ }
434
+
435
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
436
+
437
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
438
+
439
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
440
+
441
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
442
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
443
+
444
+ const float x0 = src[0];
445
+ const float x1 = src[n_dims/2];
446
+
447
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
448
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
449
+ } else {
450
+ global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
451
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
452
+
453
+ dst_data[0] = src[0];
454
+ dst_data[1] = src[1];
455
+ }
456
+ }
457
+ }
458
+
459
+ kernel void kernel_rope_multi_f16(
460
+ global void * src0,
461
+ ulong offset0,
462
+ global int * src1,
463
+ ulong offset1,
464
+ global float * src2,
465
+ ulong offset2,
466
+ global half * dst,
467
+ ulong offsetd,
468
+ int ne00,
469
+ int ne01,
470
+ int ne02,
471
+ int ne03,
472
+ ulong nb00,
473
+ ulong nb01,
474
+ ulong nb02,
475
+ ulong nb03,
476
+ int ne0,
477
+ int ne1,
478
+ int ne2,
479
+ int ne3,
480
+ ulong nb0,
481
+ ulong nb1,
482
+ ulong nb2,
483
+ ulong nb3,
484
+ int n_past,
485
+ int n_dims,
486
+ int n_ctx_orig,
487
+ float freq_base,
488
+ float freq_scale,
489
+ float ext_factor,
490
+ float attn_factor,
491
+ float beta_fast,
492
+ float beta_slow,
493
+ int4 sections
494
+ ) {
495
+ src0 = (global void*)((global char*)src0 + offset0);
496
+ src1 = (global int*)((global char*)src1 + offset1);
497
+ src2 = (global float*)((global char*)src2 + offset2);
498
+ dst = (global half*)((global char*)dst + offsetd);
499
+
500
+ int i3 = get_group_id(2);
501
+ int i2 = get_group_id(1);
502
+ int i1 = get_group_id(0);
503
+
504
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
505
+
506
+ global int * pos = src1;
507
+
508
+ const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
509
+ const int sec_w = sections.s1 + sections.s0;
510
+
511
+ float inv_ndims = -1.f/n_dims;
512
+
513
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
514
+ if (i0 < n_dims) {
515
+ int ic = i0/2;
516
+
517
+ const int sector = (i0 / 2) % sect_dims;
518
+ float theta_base = 0.0f;
519
+
520
+ if (sector < sections.s0) {
521
+ theta_base = pos[i2];
522
+ }
523
+ else if (sector >= sections.s0 && sector < sec_w) {
524
+ theta_base = pos[i2 + ne2 * 1];
525
+ }
526
+ else if (sector >= sec_w && sector < sec_w + sections.s2) {
527
+ theta_base = pos[i2 + ne2 * 2];
528
+ }
529
+ else if (sector >= sec_w + sections.s2) {
530
+ theta_base = pos[i2 + ne2 * 3];
531
+ }
532
+
533
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
534
+
535
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
536
+
537
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
538
+
539
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
540
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
541
+
542
+ const float x0 = src[0];
543
+ const float x1 = src[n_dims/2];
544
+
545
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
546
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
547
+ } else {
548
+ global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
549
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
550
+
551
+ dst_data[0] = src[0];
552
+ dst_data[1] = src[1];
553
+ }
554
+ }
555
+ }
556
+
557
+ kernel void kernel_rope_vision_f32(
558
+ global void * src0,
559
+ ulong offset0,
560
+ global int * src1,
561
+ ulong offset1,
562
+ global float * src2,
563
+ ulong offset2,
564
+ global float * dst,
565
+ ulong offsetd,
566
+ int ne00,
567
+ int ne01,
568
+ int ne02,
569
+ int ne03,
570
+ ulong nb00,
571
+ ulong nb01,
572
+ ulong nb02,
573
+ ulong nb03,
574
+ int ne0,
575
+ int ne1,
576
+ int ne2,
577
+ int ne3,
578
+ ulong nb0,
579
+ ulong nb1,
580
+ ulong nb2,
581
+ ulong nb3,
582
+ int n_past,
583
+ int n_dims,
584
+ int n_ctx_orig,
585
+ float freq_base,
586
+ float freq_scale,
587
+ float ext_factor,
588
+ float attn_factor,
589
+ float beta_fast,
590
+ float beta_slow,
591
+ int4 sections
592
+ ) {
593
+ src0 = (global void*)((global char*)src0 + offset0);
594
+ src1 = (global int*)((global char*)src1 + offset1);
595
+ src2 = (global float*)((global char*)src2 + offset2);
596
+ dst = (global float*)((global char*)dst + offsetd);
597
+
598
+ int i3 = get_group_id(2);
599
+ int i2 = get_group_id(1);
600
+ int i1 = get_group_id(0);
601
+
602
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
603
+
604
+ global int * pos = src1;
605
+
606
+ const int sect_dims = sections.s0 + sections.s1;
607
+ const int sec_w = sections.s1 + sections.s0;
608
+
609
+ float inv_ndims = -1.f/n_dims;
610
+
611
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
612
+ int ic = i0/2;
613
+
614
+ const int sector = (i0/2) % sect_dims;
615
+ float theta_base = 0.0f;
616
+
617
+ if (sector < sections.s0) {
618
+ const int p = sector;
619
+ theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
620
+ } else if (sector >= sections.s0 && sector < sec_w) {
621
+ const int p = sector - sections.s0;
622
+ theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
623
+ }
624
+
625
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
626
+
627
+ float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
628
+
629
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
630
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
631
+
632
+ const float x0 = src[0];
633
+ const float x1 = src[n_dims];
634
+
635
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
636
+ dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
637
+ }
638
+ }
639
+
640
+ kernel void kernel_rope_vision_f16(
641
+ global void * src0,
642
+ ulong offset0,
643
+ global int * src1,
644
+ ulong offset1,
645
+ global float * src2,
646
+ ulong offset2,
647
+ global half * dst,
648
+ ulong offsetd,
649
+ int ne00,
650
+ int ne01,
651
+ int ne02,
652
+ int ne03,
653
+ ulong nb00,
654
+ ulong nb01,
655
+ ulong nb02,
656
+ ulong nb03,
657
+ int ne0,
658
+ int ne1,
659
+ int ne2,
660
+ int ne3,
661
+ ulong nb0,
662
+ ulong nb1,
663
+ ulong nb2,
664
+ ulong nb3,
665
+ int n_past,
666
+ int n_dims,
667
+ int n_ctx_orig,
668
+ float freq_base,
669
+ float freq_scale,
670
+ float ext_factor,
671
+ float attn_factor,
672
+ float beta_fast,
673
+ float beta_slow,
674
+ int4 sections
675
+ ) {
676
+ src0 = (global void*)((global char*)src0 + offset0);
677
+ src1 = (global int*)((global char*)src1 + offset1);
678
+ src2 = (global float*)((global char*)src2 + offset2);
679
+ dst = (global half*)((global char*)dst + offsetd);
680
+
681
+ int i3 = get_group_id(2);
682
+ int i2 = get_group_id(1);
683
+ int i1 = get_group_id(0);
684
+
685
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
686
+
687
+ global int * pos = src1;
688
+
689
+ const int sect_dims = sections.s0 + sections.s1;
690
+ const int sec_w = sections.s1 + sections.s0;
691
+
692
+ float inv_ndims = -1.f/n_dims;
693
+
694
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
695
+ int ic = i0/2;
696
+
697
+ const int sector = (i0/2) % sect_dims;
698
+ float theta_base = 0.0f;
699
+
700
+ if (sector < sections.s0) {
701
+ const int p = sector;
702
+ theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
703
+ } else if (sector >= sections.s0 && sector < sec_w) {
704
+ const int p = sector - sections.s0;
705
+ theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
706
+ }
707
+
708
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
709
+
710
+ float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
711
+
712
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
713
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
714
+
715
+ const float x0 = src[0];
716
+ const float x1 = src[n_dims];
717
+
718
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
719
+ dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
720
+ }
721
+ }
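The rope kernels above all reduce to the same operation: each work-item rotates element pairs of a row, where the pair partner sits n_dims/2 elements away (multi variants) or n_dims elements away (vision variants), and the angle comes from the per-section position times a power of freq_base. Below is a minimal scalar sketch of that pair rotation; rope_pair_rotate is a hypothetical helper, YaRN correction, frequency factors and the per-section position selection are deliberately omitted, so this is not the ggml implementation itself.

    #include <math.h>

    // Illustrative scalar sketch: rotate the pairs (row[ic], row[ic + n_dims/2])
    // that kernel_rope_multi_* touches, with theta = pos * freq_base^(-i0/n_dims).
    static void rope_pair_rotate(float * row, int n_dims, float pos, float freq_base) {
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const int   ic    = i0 / 2;
            const float theta = pos * powf(freq_base, -(float) i0 / n_dims);
            const float c  = cosf(theta);
            const float s  = sinf(theta);
            const float x0 = row[ic];
            const float x1 = row[ic + n_dims/2];
            row[ic]            = x0*c - x1*s;
            row[ic + n_dims/2] = x0*s + x1*c;
        }
    }

The vision variants apply the same rotation but pair element ic with ic + n_dims instead of ic + n_dims/2.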
ggml/src/ggml-opencl/kernels/scale.cl ADDED
@@ -0,0 +1,16 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // scale
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_scale(
7
+ global float4 * src0,
8
+ ulong offset0,
9
+ global float4 * dst,
10
+ ulong offsetd,
11
+ float scale
12
+ ) {
13
+ src0 = (global float4*)((global char*)src0 + offset0);
14
+ dst = (global float4*)((global char*)dst + offsetd);
15
+ dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
16
+ }
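Every kernel in these files follows the same argument convention: a raw buffer plus an explicit ulong byte offset that the kernel applies itself. The following is a hedged host-side sketch of launching kernel_scale with that convention; launch_scale and its parameters are illustrative and not the actual ggml-opencl.cpp code path.

    #include <CL/cl.h>

    // Illustrative helper (not in ggml): enqueue kernel_scale over n floats.
    // n is assumed to be a multiple of 4 because the kernel works on float4.
    cl_int launch_scale(cl_program program, cl_command_queue queue,
                        cl_mem src, cl_ulong off_src,
                        cl_mem dst, cl_ulong off_dst,
                        float scale, size_t n) {
        cl_int err;
        cl_kernel k = clCreateKernel(program, "kernel_scale", &err);
        if (err != CL_SUCCESS) return err;

        clSetKernelArg(k, 0, sizeof(cl_mem),   &src);     // src0
        clSetKernelArg(k, 1, sizeof(cl_ulong), &off_src); // offset0 (bytes)
        clSetKernelArg(k, 2, sizeof(cl_mem),   &dst);     // dst
        clSetKernelArg(k, 3, sizeof(cl_ulong), &off_dst); // offsetd (bytes)
        clSetKernelArg(k, 4, sizeof(float),    &scale);   // scale

        size_t global = n / 4; // one work-item per float4
        err = clEnqueueNDRangeKernel(queue, k, 1, NULL, &global, NULL, 0, NULL, NULL);
        clReleaseKernel(k);
        return err;
    }

One plausible reason for the explicit offset argument is that it lets the backend address many tensors inside a single cl_mem allocation without creating sub-buffers.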
ggml/src/ggml-opencl/kernels/silu.cl ADDED
@@ -0,0 +1,30 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // silu
5
+ //------------------------------------------------------------------------------
6
+ kernel void kernel_silu(
7
+ global float * src0,
8
+ ulong offset0,
9
+ global float * dst,
10
+ ulong offsetd
11
+ ) {
12
+ src0 = (global float*)((global char*)src0 + offset0);
13
+ dst = (global float*)((global char*)dst + offsetd);
14
+
15
+ float x = src0[get_global_id(0)];
16
+ dst[get_global_id(0)] = x / (1.0f + exp(-x));
17
+ }
18
+
19
+ kernel void kernel_silu_4(
20
+ global float4 * src0,
21
+ ulong offset0,
22
+ global float4 * dst,
23
+ ulong offsetd
24
+ ) {
25
+ src0 = (global float4*)((global char*)src0 + offset0);
26
+ dst = (global float4*)((global char*)dst + offsetd);
27
+
28
+ float4 x = src0[get_global_id(0)];
29
+ dst[get_global_id(0)] = x / (1.0f + exp(-x));
30
+ }
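Both kernels compute silu(x) = x * sigmoid(x); the _4 variant applies the same formula to float4 lanes, so the host would pick it only when the element count is a multiple of 4. A scalar reference, assuming nothing beyond the formula itself (silu_ref is illustrative):

    #include <math.h>

    // Scalar reference for the two SiLU kernels above.
    static float silu_ref(float x) {
        return x / (1.0f + expf(-x)); // x * sigmoid(x)
    }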
ggml/src/ggml-opencl/kernels/softmax_4_f16.cl ADDED
@@ -0,0 +1,87 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #ifdef ADRENO_GPU
22
+ REQD_SUBGROUP_SIZE_64
23
+ #endif
24
+ kernel void kernel_soft_max_4_f16(
25
+ global float * src0,
26
+ ulong offset0,
27
+ global half * src1,
28
+ ulong offset1,
29
+ global float * dst,
30
+ ulong offsetd,
31
+ int ne00,
32
+ int ne01,
33
+ int ne02,
34
+ float scale,
35
+ float max_bias,
36
+ float m0,
37
+ float m1,
38
+ int n_head_log2
39
+ ) {
40
+ src0 = (global float *)((global char *)src0 + offset0);
41
+ src1 = (global half *)((global char *)src1 + offset1);
42
+ dst = (global float *)((global char *)dst + offsetd);
43
+
44
+ int i03 = get_group_id(2);
45
+ int i02 = get_group_id(1);
46
+ int i01 = get_group_id(0);
47
+
48
+ global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
49
+ global half4 * pmask = (global char *)src1 != (global char *)src0 ? (global half4 *)(src1 + i01*ne00) : 0;
50
+ global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
51
+
52
+ float slope = 1.0f;
53
+
54
+ // ALiBi
55
+ if (max_bias > 0.0f) {
56
+ int h = i02;
57
+
58
+ float base = h < n_head_log2 ? m0 : m1;
59
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
60
+
61
+ slope = pow(base, exp);
62
+ }
63
+
64
+ // parallel max
65
+ float4 lmax4 = -INFINITY;
66
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
67
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f));
68
+ }
69
+ float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
70
+
71
+ const float max = sub_group_reduce_max(lmax);
72
+
73
+ // parallel sum
74
+ float4 lsum4 = 0.0f;
75
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
76
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f)) - max);
77
+ lsum4 += exp_psrc4;
78
+ pdst4[i00] = exp_psrc4;
79
+ }
80
+ float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
81
+
82
+ const float sum = sub_group_reduce_add(lsum);
83
+
84
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
85
+ pdst4[i00] /= sum;
86
+ }
87
+ }
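The slope computed in the ALiBi branch depends on m0, m1 and n_head_log2 supplied by the host. Below is a hedged sketch of the usual ggml-style derivation of those values; treat the exact formulas as an assumption rather than a specification of this backend, and alibi_params as a hypothetical helper.

    #include <math.h>

    // Hedged sketch of the host-side ALiBi parameters these kernels expect.
    static void alibi_params(int n_head, float max_bias,
                             float * m0, float * m1, int * n_head_log2) {
        *n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
        *m0 = powf(2.0f, -max_bias          / (float) *n_head_log2);
        *m1 = powf(2.0f, -(max_bias / 2.0f) / (float) *n_head_log2);
    }

    // Per head h the kernel then evaluates:
    //   slope = (h < n_head_log2) ? pow(m0, h + 1)
    //                             : pow(m1, 2*(h - n_head_log2) + 1);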
ggml/src/ggml-opencl/kernels/softmax_4_f32.cl ADDED
@@ -0,0 +1,87 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #ifdef ADRENO_GPU
22
+ REQD_SUBGROUP_SIZE_64
23
+ #endif
24
+ kernel void kernel_soft_max_4(
25
+ global float * src0,
26
+ ulong offset0,
27
+ global float * src1,
28
+ ulong offset1,
29
+ global float * dst,
30
+ ulong offsetd,
31
+ int ne00,
32
+ int ne01,
33
+ int ne02,
34
+ float scale,
35
+ float max_bias,
36
+ float m0,
37
+ float m1,
38
+ int n_head_log2
39
+ ) {
40
+ src0 = (global float*)((global char*)src0 + offset0);
41
+ src1 = (global float*)((global char*)src1 + offset1);
42
+ dst = (global float*)((global char*)dst + offsetd);
43
+
44
+ int i03 = get_group_id(2);
45
+ int i02 = get_group_id(1);
46
+ int i01 = get_group_id(0);
47
+
48
+ global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
49
+ global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0;
50
+ global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
51
+
52
+ float slope = 1.0f;
53
+
54
+ // ALiBi
55
+ if (max_bias > 0.0f) {
56
+ int h = i02;
57
+
58
+ float base = h < n_head_log2 ? m0 : m1;
59
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
60
+
61
+ slope = pow(base, exp);
62
+ }
63
+
64
+ // parallel max
65
+ float4 lmax4 = -INFINITY;
66
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
67
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
68
+ }
69
+ float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
70
+
71
+ const float max = sub_group_reduce_max(lmax);
72
+
73
+ // parallel sum
74
+ float4 lsum4 = 0.0f;
75
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
76
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
77
+ lsum4 += exp_psrc4;
78
+ pdst4[i00] = exp_psrc4;
79
+ }
80
+ float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
81
+
82
+ const float sum = sub_group_reduce_add(lsum);
83
+
84
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
85
+ pdst4[i00] /= sum;
86
+ }
87
+ }
ggml/src/ggml-opencl/kernels/softmax_f16.cl ADDED
@@ -0,0 +1,86 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #ifdef ADRENO_GPU
22
+ REQD_SUBGROUP_SIZE_64
23
+ #endif
24
+ kernel void kernel_soft_max_f16(
25
+ global float * src0,
26
+ ulong offset0,
27
+ global half * src1,
28
+ ulong offset1,
29
+ global float * dst,
30
+ ulong offsetd,
31
+ int ne00,
32
+ int ne01,
33
+ int ne02,
34
+ float scale,
35
+ float max_bias,
36
+ float m0,
37
+ float m1,
38
+ int n_head_log2
39
+ ) {
40
+ src0 = (global float *)((global char *)src0 + offset0);
41
+ src1 = (global half *)((global char *)src1 + offset1);
42
+ dst = (global float *)((global char *)dst + offsetd);
43
+
44
+ int i03 = get_group_id(2);
45
+ int i02 = get_group_id(1);
46
+ int i01 = get_group_id(0);
47
+
48
+ global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
49
+ global half * pmask = (global char *)src1 != (global char *)src0 ? src1 + i01*ne00 : 0;
50
+ global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
51
+
52
+ float slope = 1.0f;
53
+
54
+ // ALiBi
55
+ if (max_bias > 0.0f) {
56
+ int h = i02;
57
+
58
+ float base = h < n_head_log2 ? m0 : m1;
59
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
60
+
61
+ slope = pow(base, exp);
62
+ }
63
+
64
+ // parallel max
65
+ float lmax = -INFINITY;
66
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
67
+ lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
68
+ }
69
+ float max = sub_group_reduce_max(lmax);
70
+
71
+ // parallel sum
72
+ float lsum = 0.0f;
73
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
74
+ float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
75
+ lsum += exp_psrc0;
76
+ // Remember the result of exp here. exp is expensive, so we really do not
77
+ // wish to compute it twice.
78
+ pdst[i00] = exp_psrc0;
79
+ }
80
+
81
+ const float sum = sub_group_reduce_add(lsum);
82
+
83
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
84
+ pdst[i00] /= sum;
85
+ }
86
+ }
ggml/src/ggml-opencl/kernels/softmax_f32.cl ADDED
@@ -0,0 +1,86 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #ifdef ADRENO_GPU
22
+ REQD_SUBGROUP_SIZE_64
23
+ #endif
24
+ kernel void kernel_soft_max(
25
+ global float * src0,
26
+ ulong offset0,
27
+ global float * src1,
28
+ ulong offset1,
29
+ global float * dst,
30
+ ulong offsetd,
31
+ int ne00,
32
+ int ne01,
33
+ int ne02,
34
+ float scale,
35
+ float max_bias,
36
+ float m0,
37
+ float m1,
38
+ int n_head_log2
39
+ ) {
40
+ src0 = (global float*)((global char*)src0 + offset0);
41
+ src1 = (global float*)((global char*)src1 + offset1);
42
+ dst = (global float*)((global char*)dst + offsetd);
43
+
44
+ int i03 = get_group_id(2);
45
+ int i02 = get_group_id(1);
46
+ int i01 = get_group_id(0);
47
+
48
+ global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
49
+ global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0;
50
+ global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
51
+
52
+ float slope = 1.0f;
53
+
54
+ // ALiBi
55
+ if (max_bias > 0.0f) {
56
+ int h = i02;
57
+
58
+ float base = h < n_head_log2 ? m0 : m1;
59
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
60
+
61
+ slope = pow(base, exp);
62
+ }
63
+
64
+ // parallel max
65
+ float lmax = -INFINITY;
66
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
67
+ lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
68
+ }
69
+ float max = sub_group_reduce_max(lmax);
70
+
71
+ // parallel sum
72
+ float lsum = 0.0f;
73
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
74
+ float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
75
+ lsum += exp_psrc0;
76
+ // Remember the result of exp here. exp is expensive, so we really do not
77
+ // wish to compute it twice.
78
+ pdst[i00] = exp_psrc0;
79
+ }
80
+
81
+ const float sum = sub_group_reduce_add(lsum);
82
+
83
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
84
+ pdst[i00] /= sum;
85
+ }
86
+ }
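The four kernel_soft_max* variants differ only in mask precision (float vs half) and in whether they process the row as scalars or float4 lanes; the math is identical. Here is a scalar reference of what each of them computes per row, written as a plain C sketch rather than the ggml code (soft_max_row_ref is illustrative):

    #include <math.h>

    // One softmax row: scale, optional mask with ALiBi slope,
    // max-subtraction for stability, exponentiate once, normalize.
    static void soft_max_row_ref(const float * x, const float * mask, float * out,
                                 int n, float scale, float slope) {
        float max = -INFINITY;
        for (int i = 0; i < n; i++) {
            const float v = x[i]*scale + (mask ? slope*mask[i] : 0.0f);
            max = fmaxf(max, v);
        }
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            const float e = expf((x[i]*scale + (mask ? slope*mask[i] : 0.0f)) - max);
            out[i] = e;   // store exp once; it is re-scaled below
            sum   += e;
        }
        for (int i = 0; i < n; i++) {
            out[i] /= sum;
        }
    }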
ggml/src/ggml-opencl/kernels/transpose.cl ADDED
@@ -0,0 +1,84 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ // 16-bit transpose, loading/storing a 4x4 tile of elements
4
+ kernel void kernel_transpose_16(
5
+ __read_only image1d_buffer_t input,
6
+ __write_only image1d_buffer_t output,
7
+ const uint rows,
8
+ const uint cols
9
+ ) {
10
+
11
+ const int i = get_global_id(0);
12
+ const int j = get_global_id(1);
13
+ const int i_2 = i<<2;
14
+ const int j_2 = j<<2;
15
+
16
+ half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
17
+ half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
18
+ half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
19
+ half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
20
+
21
+ write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
22
+ write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
23
+ write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
24
+ write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
25
+ }
26
+
27
+ // 32-bit transpose, loading/storing a 4x4 tile of elements
28
+ kernel void kernel_transpose_32(
29
+ __read_only image1d_buffer_t input,
30
+ __write_only image1d_buffer_t output,
31
+ const uint rows,
32
+ const uint cols
33
+ ) {
34
+
35
+ const int i = get_global_id(0);
36
+ const int j = get_global_id(1);
37
+ const int i_2 = i<<2;
38
+ const int j_2 = j<<2;
39
+
40
+ float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
41
+ float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
42
+ float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
43
+ float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
44
+
45
+ write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
46
+ write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
47
+ write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
48
+ write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
49
+
50
+ }
51
+
52
+ // 32-bit transpose, loading/storing a 4x4 tile of elements
53
+ // Only used for activations
54
+ // converts to FP16
55
+ // also adds zero padding for non multiple of 8 prompt lengths
56
+ kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
57
+
58
+ const int i = get_global_id(0);
59
+ const int j = get_global_id(1);
60
+ const int i_2 = i<<2;
61
+ const int j_2 = j<<2;
62
+ half4 temp0 = {0,0,0,0}; // initialize outputs to 0
63
+ half4 temp1 = {0,0,0,0};
64
+ half4 temp2 = {0,0,0,0};
65
+ half4 temp3 = {0,0,0,0};
66
+
67
+ if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
68
+ temp0 = read_imageh(input, (j_2+0)*cols+i);
69
+ }
70
+ if((j_2+1)*cols+i*4+3 < rows*cols*16){
71
+ temp1 = read_imageh(input, (j_2+1)*cols+i);
72
+ }
73
+ if((j_2+2)*cols+i*4+3 < rows*cols*16){
74
+ temp2 = read_imageh(input, (j_2+2)*cols+i);
75
+ }
76
+ if((j_2+3)*cols+i*4+3 < rows*cols*16){
77
+ temp3 = read_imageh(input, (j_2+3)*cols+i);
78
+ }
79
+
80
+ write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
81
+ write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
82
+ write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
83
+ write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
84
+ }
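These transpose kernels read and write image1d_buffer_t objects, i.e. image views wrapped around ordinary cl_mem buffers, which routes the 4x4 tile loads through the texture path. A hedged sketch of creating such a view on the host follows; make_image1d_buffer_view, the CL_RGBA/CL_HALF_FLOAT format and the read-only flag are illustrative assumptions, not the exact ggml-opencl.cpp setup.

    #include <string.h>
    #include <CL/cl.h>

    // Illustrative helper (not in ggml): wrap an existing cl_mem in an
    // image1d_buffer_t view with 4 half channels per element, matching the
    // half4 loads in kernel_transpose_16.
    cl_mem make_image1d_buffer_view(cl_context ctx, cl_mem buffer,
                                    size_t n_half4_elems, cl_int * err) {
        cl_image_format fmt;
        fmt.image_channel_order     = CL_RGBA;       // 4 channels -> one half4 per element
        fmt.image_channel_data_type = CL_HALF_FLOAT;

        cl_image_desc desc;
        memset(&desc, 0, sizeof(desc));
        desc.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        desc.image_width = n_half4_elems;            // in elements, not bytes
        desc.buffer      = buffer;                   // the underlying buffer

        return clCreateImage(ctx, CL_MEM_READ_ONLY, &fmt, &desc, NULL, err);
    }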