Diego Devesa committed · Commit 2b7ae5e
1 Parent(s): 137a0dc

ggml : fix fallback to CPU for ununsupported ops (llama/15118)
Files changed:
- ggml/src/ggml-backend.cpp +7 -2
- ggml/src/ggml-cpu/ggml-cpu.cpp +17 -20
- ggml/src/ggml-cpu/traits.cpp +2 -2
- ggml/src/ggml-cpu/traits.h +1 -1
ggml/src/ggml-backend.cpp CHANGED

@@ -1071,6 +1071,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
         }
+        // if the node is still unassigned, assign it to the first backend that supports it
+        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+        }
+        GGML_ASSERT(*cur_backend_id != -1);
     }
 
     // pass 5: split graph, find tensors that need to be copied
@@ -1098,7 +1103,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             const int node_backend_id = tensor_backend_id(node);
 
-            assert(node_backend_id != -1); // all nodes should be assigned by now
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
 
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
@@ -1156,7 +1161,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
                 size_t src_id = hash_id(src);
                 const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-                assert(src_backend_id != -1); // all inputs should be assigned by now
+                GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
 
                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
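The first hunk carries the actual fix: if none of the earlier assignment passes picked a backend for a node, the scheduler now walks the backend list, takes the first one that reports support via ggml_backend_sched_set_if_supported, and aborts with GGML_ASSERT if nothing does. Below is a minimal, self-contained sketch of that "first backend that supports the op, otherwise assert" pattern; fake_backend and assign_backend are invented for illustration and are not the real scheduler types.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical stand-ins for illustration only; the real scheduler works on
// ggml_backend_sched_t / ggml_tensor and calls ggml_backend_sched_set_if_supported.
struct fake_backend {
    const char * name;
    bool (*supports)(int op);
};

// Assign `op` to the first backend that supports it, mirroring the new
// "if the node is still unassigned" loop; assert if nothing supports it,
// which is what GGML_ASSERT(*cur_backend_id != -1) now enforces.
static int assign_backend(const std::vector<fake_backend> & backends, int op) {
    int backend_id = -1;
    for (int b = 0; b < (int) backends.size() && backend_id == -1; b++) {
        if (backends[b].supports(op)) {
            backend_id = b;
        }
    }
    assert(backend_id != -1 && "no backend supports this op and there is no CPU fallback");
    return backend_id;
}

int main() {
    std::vector<fake_backend> backends = {
        { "gpu", [](int op) { return op < 10; } },          // accelerator with partial op coverage
        { "cpu", [](int op) { (void) op; return true; } },  // CPU fallback supports everything
    };
    printf("op 5  -> %s\n", backends[assign_backend(backends, 5)].name);   // gpu
    printf("op 42 -> %s\n", backends[assign_backend(backends, 42)].name);  // cpu
    return 0;
}
```

The hard assert looks intentional: the later passes (and the old plain assert) already assumed every node had a backend, and, as the new comment notes, an unassigned node should only occur when there is no CPU fallback, so failing early beats scheduling the op onto a backend that cannot run it.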
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED

@@ -35,7 +35,7 @@
 
 // ggml-backend interface
 
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
@@ -57,8 +57,6 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
         }
 #endif
 
-        bufts.push_back(NULL);
-
         return bufts;
     }();
 
@@ -66,14 +64,20 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
 }
 
 static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    return ggml_backend_cpu_get_extra_buffers_type().data();
+    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
+        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
+        bufts.push_back(nullptr);
+        return bufts;
+    }();
+
+    return extra_bufts.data();
 
     GGML_UNUSED(device);
 }
 
 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) {
+    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (extra == buft) {
             return true;
         }
     }
@@ -397,20 +401,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         return true;
     }
 
-    // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
-            if (buf_extra && buf_extra->supports_op(dev, op)) {
-                return true;
-            }
-        }
-    }
-
-    // the other case need host buffer.
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
-            return false;
+    // check extra buffer types
+    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
+    for (int i = 0; i < 4; i++) {
+        if (op->src[i] && op->src[i]->buffer &&
+            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
+            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
+            return buf_extra->supports_op(dev, op);
         }
     }
 
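Two related changes land in the CPU backend: the shared extra-buffer-type list no longer carries a trailing NULL, the terminator is instead appended to a private, lazily built copy inside ggml_backend_cpu_device_get_extra_buffers_type, and supports_op now checks whether one of the first few sources lives in an extra buffer type and, if so, delegates the decision to that buffer type. The sketch below illustrates the function-local static initialized by an immediately invoked lambda that both functions rely on; buft_t and the string entries are placeholders, not the real ggml_backend_buffer_type_t objects.

```cpp
#include <cstdio>
#include <vector>

// Stand-in for ggml_backend_buffer_type_t; purely illustrative.
using buft_t = const char *;

// Built once, on first call, by an immediately invoked lambda -- the same
// pattern as ggml_backend_cpu_get_extra_buffer_types. No trailing nullptr,
// so range-for callers only ever see real entries.
static std::vector<buft_t> & get_extra_buffer_types() {
    static std::vector<buft_t> bufts = []() {
        std::vector<buft_t> b;
        b.push_back("amx");      // e.g. types registered only when the feature is compiled in
        b.push_back("aarch64");
        return b;
    }();
    return bufts;
}

// nullptr-terminated array view for C-style callers, mirroring how
// ggml_backend_cpu_device_get_extra_buffers_type now appends the terminator
// to a private copy instead of the shared vector.
static buft_t * get_extra_buffer_types_array() {
    static std::vector<buft_t> extra = [] {
        std::vector<buft_t> b = get_extra_buffer_types();
        b.push_back(nullptr); // terminator lives only in this copy
        return b;
    }();
    return extra.data();
}

int main() {
    for (auto * t : get_extra_buffer_types()) {
        printf("extra buffer type: %s\n", t);
    }
    for (buft_t * p = get_extra_buffer_types_array(); *p != nullptr; p++) {
        printf("array view entry: %s\n", *p);
    }
    return 0;
}
```

With C++11 magic statics the vector is built once, on first use, and initialization is thread-safe; keeping the terminator out of the shared vector means range-for callers (as in traits.cpp below) never have to skip a null entry.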
ggml/src/ggml-cpu/traits.cpp CHANGED

@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
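For context on these loops: each registered extra buffer type may attach tensor traits to an op, and ggml_cpu_extra_compute_forward / ggml_cpu_extra_work_size simply hand the op to the first extra buffer type that claims it, otherwise the generic CPU path runs. The stripped-down sketch below shows that dispatch shape with invented stand-ins (fake_traits, fake_extra_buffer_type, amx_like); the real interfaces live in ggml::cpu::extra_buffer_type and operate on ggml_compute_params / ggml_tensor, not an integer op id.

```cpp
#include <cstdio>
#include <vector>

// Invented stand-ins for ggml::cpu::extra_buffer_type and its tensor traits.
struct fake_traits {
    bool (*compute_forward)(int op);
};

struct fake_extra_buffer_type {
    // Return traits if this buffer type wants to handle the op, else nullptr.
    virtual fake_traits * get_tensor_traits(int op) = 0;
    virtual ~fake_extra_buffer_type() = default;
};

struct amx_like : fake_extra_buffer_type {
    fake_traits traits = { [](int op) { printf("extra path for op %d\n", op); return true; } };
    fake_traits * get_tensor_traits(int op) override {
        return op % 2 == 0 ? &traits : nullptr; // claim only even ops, as an example
    }
};

// Mirrors the loop in ggml_cpu_extra_compute_forward: hand the op to the first
// extra buffer type that claims it, otherwise fall back to the generic CPU path.
static bool extra_compute_forward(const std::vector<fake_extra_buffer_type *> & extras, int op) {
    for (auto * extra : extras) {
        if (extra == nullptr) {
            continue; // tolerate empty slots, like the `extra && extra->context` check
        }
        if (fake_traits * t = extra->get_tensor_traits(op)) {
            return t->compute_forward(op);
        }
    }
    return false;
}

int main() {
    amx_like amx;
    std::vector<fake_extra_buffer_type *> extras = { &amx, nullptr };
    printf("op 4 handled by extra type: %d\n", extra_compute_forward(extras, 4)); // 1
    printf("op 5 handled by extra type: %d\n", extra_compute_forward(extras, 5)); // 0
    return 0;
}
```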
ggml/src/ggml-cpu/traits.h CHANGED

@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 
 #endif