shupeif committed
Commit bf73242 · 1 Parent(s): c8008b8

ggml-cpu: support IQ4_NL_4_4 by runtime repack (llama/10541)


* ggml-cpu: support IQ4_NL_4_4 by runtime repack

* ggml-cpu: add __ARM_FEATURE_DOTPROD guard
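The stored IQ4_NL data is left unchanged; the interleaved IQ4_NL_4_4 layout is produced only when a weight tensor is loaded into an aarch64-optimized CPU buffer. A minimal sketch of that load-time flow, using only the two entry points touched by this diff (the helper name is illustrative, the real call sites live in ggml-cpu.cpp and are not part of this commit):

    #include "ggml-cpu-aarch64.h"

    // Illustrative helper, not part of the commit.
    static void repack_on_load(struct ggml_tensor * t, const void * data, size_t data_size) {
        // For an IQ4_NL tensor this returns GGML_TYPE_IQ4_NL_4_4 when NEON and the
        // dot-product extension are available at runtime; otherwise it returns
        // t->type and the data is used as-is.
        enum ggml_type repack_type = ggml_aarch64_get_optimal_repack_type(t);
        // Interleaves every 4 rows into block_iq4_nlx4 blocks (the runtime repack).
        ggml_aarch64_repack_tensor(t, repack_type, data, data_size);
    }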

ggml/include/ggml-cpu.h CHANGED
@@ -91,6 +91,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void); // sve vector length in bytes
ggml/include/ggml.h CHANGED
@@ -389,6 +389,9 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT,
     };

ggml/src/ggml-common.h CHANGED
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

+typedef struct {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL

ggml/src/ggml-cpu/ggml-cpu-aarch64.c CHANGED
@@ -187,6 +187,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 }
 #endif

+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -528,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -996,6 +998,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }

+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1017,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -3386,6 +3484,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }

+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 // FIXME: this code is duplicated from ggml-aarch64.c
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
@@ -3518,6 +3727,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
     GGML_UNUSED(data_size);
 }

+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 // Prepare for optimized kernels if applicable
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
     if (cur->type == repack_type) {
@@ -3525,20 +3798,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
         return;
     }

-    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-
-    switch (repack_type) {
-        case GGML_TYPE_Q4_0_8_8:
-            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_8:
-            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_4:
-            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-            break;
-        default:
-            GGML_ABORT("Unsupported type");
+    if (cur->type == GGML_TYPE_Q4_0) {
+        switch (repack_type) {
+            case GGML_TYPE_Q4_0_8_8:
+                repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_8:
+                repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_4:
+                repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        switch (repack_type) {
+            case GGML_TYPE_IQ4_NL_4_4:
+                repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else {
+        GGML_ABORT("Unsupported type");
     }
 }

@@ -3551,9 +3834,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             return GGML_TYPE_Q4_0_4_8;
         }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             return GGML_TYPE_Q4_0_4_4;
         }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            return GGML_TYPE_IQ4_NL_4_4;
+        }
     }

     return cur->type;
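For the blck_size_interleave == 4 case used by IQ4_NL_4_4, the interleaving performed by make_block_iq4_nlx4() above lays the four source rows out in 4-byte groups. A worked layout, derived from the src_id/src_offset/dst_offset arithmetic and shown here only as an illustration:

    // dst.qs[ 0.. 3] = row0.qs[ 0.. 3]    dst.qs[ 4.. 7] = row1.qs[ 0.. 3]
    // dst.qs[ 8..11] = row2.qs[ 0.. 3]    dst.qs[12..15] = row3.qs[ 0.. 3]
    // dst.qs[16..19] = row0.qs[ 4.. 7]    dst.qs[20..23] = row1.qs[ 4.. 7]
    // ... and so on until all QK4_NL * 2 = 64 bytes are filled. d[0..3] keep the
    // four per-row scales, so one block_iq4_nlx4 packs one QK4_NL block from each
    // of 4 consecutive rows.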
ggml/src/ggml-cpu/ggml-cpu-aarch64.h CHANGED
@@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
 enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
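The two new prototypes mirror the existing q4_0 kernels: vx is the repacked (interleaved) weight matrix, vy the q8_0-quantized activations, s the float output with row stride bs, and nr/nc the output row/column counts. A hedged usage sketch for the GEMV path (the helper name and shapes are assumptions, not part of the commit):

    // Illustrative only: one q8_0-quantized input row of length n against nc
    // interleaved IQ4_NL_4_4 weight rows. nr is 1 for the GEMV path; the 4x4
    // GEMV kernel above writes nc consecutive floats into s and does not read bs.
    static void mul_row_example(int n, int nc, const void * w_repacked, const void * a_q8, float * s) {
        ggml_gemv_iq4_nl_4x4_q8_0(n, s, /*bs=*/nc, w_repacked, a_q8, /*nr=*/1, /*nc=*/nc);
    }

For the GEMM path, nr must be a multiple of 4 and vy is expected in the block_q8_0x4 layout produced by quantize_mat_q8_0.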
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
+    int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif


@@ -446,6 +447,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .from_float = NULL,
+        .vec_dot = NULL,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
+        .ncols = 4,
+        .gemv = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2439,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);

     ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);

@@ -2453,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_neon = oldp;

+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
@@ -9133,6 +9149,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -13880,6 +13897,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }

+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
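ggml_cpu_has_dotprod() combines a compile-time and a runtime check: it can only return non-zero when the binary was built with __ARM_FEATURE_DOTPROD and the probe in ggml_init_arm_arch_features() reports the feature (via HWCAP on Linux, or the hw.optional.arm.FEAT_DotProd sysctl on Apple platforms). A minimal sketch of the intended gating pattern, mirroring the kernels in ggml-cpu-aarch64.c (illustrative helper, not part of the commit):

    #include "ggml-cpu.h"

    // Choose between the SDOT kernel and the scalar fallback: both the compiler
    // target and the running CPU must support the dot-product extension.
    static int use_dotprod_kernel(void) {
    #if defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
        return ggml_cpu_has_neon() && ggml_cpu_has_dotprod();
    #else
        return 0;
    #endif
    }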
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     const struct ggml_tensor * src1 = op->src[1];

     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
ggml/src/ggml.c CHANGED
@@ -831,6 +831,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .type_name = "iq4_nl_4x4",
+        .blck_size = QK4_NL,
+        .blck_size_interleave = 4,
+        .type_size = sizeof(block_iq4_nl),
+        .is_quantized = true,
+        .to_float = NULL,
+        .from_float_ref = NULL,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {