John Balis (committed by ggerganov)

Commit 04a5333 · 1 Parent(s): 9e35f6d

ggml : add `ggml_upscale_ext` (ggml/814)


* initial commit with CPU implementation of upscale to shape and test, cuda implementation next

* experimental commit to see if dst shape is correct

* test version

* test

* removed unnecessary params

* refactor

* fixed tests

* ggml : metal impl + cleanup + sycl dev warnings

* patched ggml_upscale cuda op to handle non-contiguous tensors, added test for non-contiguous behavior

* metal : fix upscale op to support nb00 + style

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (6)
  1. ggml-cuda/upscale.cu +33 -30
  2. ggml-metal.m +8 -2
  3. ggml-metal.metal +13 -8
  4. ggml-sycl.cpp +4 -0
  5. ggml.c +47 -17
  6. ggml.h +12 -0
ggml-cuda/upscale.cu CHANGED
```diff
@@ -1,35 +1,36 @@
 #include "upscale.cuh"
 
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
-    // blockIdx.z: idx of ne02*ne03
-    // blockIdx.y: idx of ne01*scale_factor, aka ne1
-    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
-    // ne00xne01: ne00 * ne01
-    int ne0 = ne00 * scale_factor;
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
+static __global__ void upscale_f32(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3) {
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (index >= ne10 * ne11 * ne12 * ne13) {
         return;
     }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = blockIdx.y / scale_factor;
-    int offset_src =
-        i00 +
-        i01 * ne00 +
-        blockIdx.z * ne00xne01;
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    dst[offset_dst] = x[offset_src];
+
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
-                             const int scale_factor, cudaStream_t stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
-    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+static void upscale_f32_cuda(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3,
+        cudaStream_t stream) {
+    int dst_size   = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+
+    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
 }
 
 void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
 
-    upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
+    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
 }
```
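For reference, the new kernel no longer assumes a single integer scale factor or a contiguous source: each thread takes a flat destination index, decomposes it into 4D coordinates, divides by the per-dimension scale factors to find the nearest source element, and addresses the source through its byte strides. A plain-C sketch of that per-element index math (the helper name is illustrative, not part of the commit):

```c
// Host-side reference of the per-element mapping performed by upscale_f32 above
// (hypothetical helper, shown only to illustrate the index math).
static void upscale_f32_ref(const float * src, float * dst,
        size_t nb00, size_t nb01, size_t nb02, size_t nb03,   // source byte strides
        int ne10, int ne11, int ne12, int ne13,                // destination shape
        float sf0, float sf1, float sf2, float sf3) {          // dst_ne / src_ne per dim
    for (int index = 0; index < ne10*ne11*ne12*ne13; ++index) {
        // decompose the flat destination index into 4D coordinates
        const int i10 =  index                     % ne10;
        const int i11 = (index /  ne10)            % ne11;
        const int i12 = (index / (ne10*ne11))      % ne12;
        const int i13 = (index / (ne10*ne11*ne12)) % ne13;

        // nearest-neighbor: truncating division maps each dst coordinate to a src coordinate
        const int i00 = (int) (i10 / sf0);
        const int i01 = (int) (i11 / sf1);
        const int i02 = (int) (i12 / sf2);
        const int i03 = (int) (i13 / sf3);

        // address the source via byte strides so non-contiguous tensors work
        dst[index] = *(const float *) ((const char *) src +
                i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
    }
}
```
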
ggml-metal.m CHANGED
```diff
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
         {
             GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
-            const int sf = dst->op_params[0];
+            const float sf0 = (float)ne0/src0->ne[0];
+            const float sf1 = (float)ne1/src0->ne[1];
+            const float sf2 = (float)ne2/src0->ne[2];
+            const float sf3 = (float)ne3/src0->ne[3];
 
             const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
 
@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
             [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
             [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
             [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
-            [encoder setBytes:&sf  length:sizeof(sf)  atIndex:18];
+            [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
+            [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
+            [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
+            [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
 
             const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
```
ggml-metal.metal CHANGED
```diff
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
         constant uint64_t & nb1,
         constant uint64_t & nb2,
         constant uint64_t & nb3,
-        constant  int32_t & sf,
+        constant    float & sf0,
+        constant    float & sf1,
+        constant    float & sf2,
+        constant    float & sf3,
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint3 tpitg[[thread_position_in_threadgroup]],
         uint3 ntg[[threads_per_threadgroup]]) {
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
     const int64_t i2 = tgpig.y;
     const int64_t i1 = tgpig.x;
 
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1/sf;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+    const int64_t i03 = i3/sf3;
+    const int64_t i02 = i2/sf2;
+    const int64_t i01 = i1/sf1;
 
     for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = src0_ptr[i0/sf];
+        const int64_t i00 = i0/sf0;
+
+        device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0);
+
+        dst_ptr[0] = src0_ptr[0];
     }
 }
 
```
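Both the CUDA and Metal kernels now read the source element-by-element through its byte strides (nb00..nb03), which is what makes non-contiguous inputs work, as mentioned in the commit message. A hypothetical sketch of how such an input could be constructed (this is not the commit's actual test; it only assumes the standard ggml tensor helpers and an existing context):

```c
#include "ggml.h"

// Hypothetical sketch: build an upscale whose source is non-contiguous in dim 0.
// Assumes `ctx` is an already-initialized ggml_context.
static struct ggml_tensor * build_noncontig_upscale(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 4, 3, 1);

    // permuting swaps byte strides, so t->nb[0] != sizeof(float)
    struct ggml_tensor * t = ggml_permute(ctx, a, 1, 0, 2, 3);   // ne = [4, 8, 3, 1]

    // per-dimension upscale to an explicit target shape (each target dim >= source dim)
    return ggml_upscale_ext(ctx, t, 8, 16, 3, 1);
}
```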
 
ggml-sycl.cpp CHANGED
```diff
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
+#pragma message("TODO: generalize upscale operator")
+#pragma message("    https://github.com/ggerganov/ggml/pull/814")
+    GGML_ASSERT(false && "TODO: generalize upscale operator");
+
     const int scale_factor = dst->op_params[0];
 
     upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
```
ggml.c CHANGED
```diff
@@ -6293,7 +6293,10 @@ struct ggml_tensor * ggml_pool_2d(
 static struct ggml_tensor * ggml_upscale_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int scale_factor) {
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
     bool is_node = false;
 
     if (a->grad) {
@@ -6301,19 +6304,45 @@ static struct ggml_tensor * ggml_upscale_impl(
         is_node = true;
     }
 
+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] * scale_factor,
-            a->ne[1] * scale_factor,
-            a->ne[2], a->ne[3]);
+            ne0,
+            ne1,
+            ne2,
+            ne3
+            );
 
     result->op = GGML_OP_UPSCALE;
-    result->op_params[0] = scale_factor;
+
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
     return result;
 }
 
+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
 struct ggml_tensor * ggml_pad(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
@@ -6338,12 +6367,7 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
-struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
+// ggml_arange
 
 struct ggml_tensor * ggml_arange(
     struct ggml_context * ctx,
@@ -6365,6 +6389,8 @@ struct ggml_tensor * ggml_arange(
     return result;
 }
 
+// ggml_timestep_embedding
+
 struct ggml_tensor * ggml_timestep_embedding(
     struct ggml_context * ctx,
     struct ggml_tensor * timesteps,
@@ -14820,25 +14846,28 @@ static void ggml_compute_forward_upscale_f32(
         return;
     }
 
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const int ith = params->ith;
     const int nth = params->nth;
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
 
     // TODO: optimize
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
         for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
             for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
+                const int64_t i01 = i1 / sf1;
                 for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
+                    const int64_t i00 = i0 / sf0;
 
                     const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                           float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
@@ -14868,6 +14897,7 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+
 // ggml_compute_forward_pad
 
 static void ggml_compute_forward_pad_f32(
```
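On the CPU side, the forward pass now derives a float scale factor per dimension from the dst/src shapes instead of reading a single value from op_params. A minimal end-to-end sketch of exercising this path with the standard ggml context/graph helpers (illustrative only, not the commit's own test):

```c
#include "ggml.h"
#include <stdio.h>

// Minimal sketch (not the commit's test): upscale a 2x2 F32 tensor to 4x4 on the
// CPU backend and print the nearest-neighbor result.
int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 1, 1);
    float * ad = (float *) a->data;
    ad[0] = 0.f; ad[1] = 1.f;   // row 0
    ad[2] = 2.f; ad[3] = 3.f;   // row 1

    // target shape [4, 4, 1, 1] -> sf0 = sf1 = 2
    struct ggml_tensor * b = ggml_upscale_ext(ctx, a, 4, 4, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    // expected: each source element is repeated in a 2x2 block
    const float * bd = (const float *) b->data;
    for (int i1 = 0; i1 < 4; ++i1) {
        for (int i0 = 0; i0 < 4; ++i0) {
            printf("%.0f ", bd[i1*4 + i0]);
        }
        printf("\n");
    }

    ggml_free(ctx);
    return 0;
}
```
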
ggml.h CHANGED
```diff
@@ -1674,12 +1674,24 @@ extern "C" {
             float                 p1);
 
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   ne0,
+            int                   ne1,
+            int                   ne2,
+            int                   ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
```
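For callers, the existing `ggml_upscale` keeps its behavior while `ggml_upscale_ext` takes the target shape directly. A short sketch of the two entry points (assuming an existing ggml_context `ctx`):

```c
// x has shape [16, 16, 3, 1]
struct ggml_tensor * x  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 3, 1);

// old API: multiplies ne0 and ne1 by an integer factor  -> [32, 32, 3, 1]
struct ggml_tensor * y0 = ggml_upscale    (ctx, x, 2);

// new API: nearest interpolation to an explicit shape   -> [64, 48, 3, 1]
struct ggml_tensor * y1 = ggml_upscale_ext(ctx, x, 64, 48, 3, 1);
```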