Diego Devesa committed
Commit 2b7ae5e · 1 Parent(s): 137a0dc

ggml : fix fallback to CPU for unsupported ops (llama/15118)
ggml/src/ggml-backend.cpp CHANGED
@@ -1071,6 +1071,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
         }
+        // if the node is still unassigned, assign it to the first backend that supports it
+        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+        }
+        GGML_ASSERT(*cur_backend_id != -1);
     }
 
     // pass 5: split graph, find tensors that need to be copied
@@ -1098,7 +1103,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
         const int node_backend_id = tensor_backend_id(node);
 
-        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
 
         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
@@ -1156,7 +1161,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             size_t src_id = hash_id(src);
             const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-            assert(src_backend_id != -1); // all inputs should be assigned by now
+            GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
 
             if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
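Note: the fallback loop added just before pass 5 relies on ggml_backend_sched_set_if_supported, whose body is outside this diff. A minimal sketch of what the helper is assumed to do, based on how it is called above: claim the node for backend b only when that backend reports support for the op, otherwise leave the id unassigned so the loop tries the next backend.

    // sketch (assumption, not the verbatim helper from ggml-backend.cpp)
    static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int backend_id, int * node_backend_id) {
        // only claim the node if this backend can actually run the op
        if (ggml_backend_supports_op(sched->backends[backend_id], node)) {
            *node_backend_id = backend_id;
        }
    }

With this in place, a node whose op is unsupported by its preferred backend falls back to any other registered backend (typically the CPU), and the GGML_ASSERT fires only when no backend at all can run the op.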
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -35,7 +35,7 @@
 
 // ggml-backend interface
 
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
@@ -57,8 +57,6 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
     }
 #endif
 
-    bufts.push_back(NULL);
-
     return bufts;
 }();
 
@@ -66,14 +64,20 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
 }
 
 static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    return ggml_backend_cpu_get_extra_buffers_type().data();
+    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
+        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
+        bufts.push_back(nullptr);
+        return bufts;
+    }();
+
+    return extra_bufts.data();
 
     GGML_UNUSED(device);
 }
 
 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) {
+    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (extra == buft) {
             return true;
         }
     }
@@ -397,20 +401,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         return true;
     }
 
-    // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
-            if (buf_extra && buf_extra->supports_op(dev, op)) {
-                return true;
-            }
-        }
-    }
-
-    // the other case need host buffer.
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
-            return false;
+    // check extra buffer types
+    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
+    for (int i = 0; i < 4; i++) {
+        if (op->src[i] && op->src[i]->buffer &&
+            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
+            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
+            return buf_extra->supports_op(dev, op);
         }
     }
 
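Note: after this change, ggml_backend_cpu_device_supports_op no longer iterates every registered extra buffer type, and it no longer rejects an op outright because a source lives in a non-host buffer; instead it inspects the first few sources and, when one of them is stored in an extra buffer type (such as a repacked-weights buffer), delegates the decision to that buffer type's supports_op. A hedged sketch of how this is reachable through the public device API; cpu_can_run is a hypothetical helper, not part of ggml:

    #include "ggml-backend.h"

    // hypothetical helper: ask the CPU device whether it can execute a graph node.
    // ggml_backend_dev_supports_op dispatches to the device's supports_op callback,
    // which for the CPU device is the ggml_backend_cpu_device_supports_op shown above.
    static bool cpu_can_run(const struct ggml_tensor * op) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        return cpu_dev != NULL && ggml_backend_dev_supports_op(cpu_dev, op);
    }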
 
ggml/src/ggml-cpu/traits.cpp CHANGED
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
ggml/src/ggml-cpu/traits.h CHANGED
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 
 #endif