Spaces:
Running
Running
lhez
opencl: add new ops - `argsort`, `div`, `sub`, `addrows`, `sigmoid`, `group_norm` (llama/13787)
1ab0f23
| // Workgroup must be a subgroup: the sums below are combined only with sub_group_reduce_add, so a work-group spanning several subgroups would reduce each one separately | |
| REQD_SUBGROUP_SIZE_32 /* NOTE(review): two required-subgroup-size macros appear back to back; presumably preprocessor guards selecting one were lost from this view -- confirm against the original file */ | |
| REQD_SUBGROUP_SIZE_64 | |
| /* Normalizes one group of group_size consecutive floats per work-group: subtracts the group mean, then scales by 1/sqrt(variance + eps). */ kernel void kernel_group_norm( | |
| global float * src0, /* input values; rebased by offset0 below */ | |
| ulong offset0, /* byte offset added to src0 */ | |
| global float * dst, /* output values; rebased by offsetd below */ | |
| ulong offsetd, /* byte offset added to dst */ | |
| int ne, /* total element count; upper bound for the last group */ | |
| int group_size, /* number of elements per group */ | |
| float eps /* added to the variance before the square root */ | |
| ) { | |
| src0 = (global float *)((global char *)src0 + offset0); /* offsets are in bytes, hence the char cast */ | |
| dst = (global float *)((global char *)dst + offsetd); | |
| int start = get_group_id(0) * group_size; /* first element of this work-group's group */ | |
| int end = start + group_size; | |
| start += get_local_id(0); /* each work-item starts at its own local index and strides by the local size */ | |
| if (end >= ne) { /* clamp so the last group does not read or write past ne */ | |
| end = ne; | |
| } | |
| float tmp = 0.0f; | |
| for (int j = start; j < end; j += get_local_size(0)) { /* pass 1: per-work-item partial sum */ | |
| tmp += src0[j]; | |
| } | |
| tmp = sub_group_reduce_add(tmp); /* total of the partial sums across the subgroup */ | |
| const float mean = tmp / group_size; /* NOTE(review): divisor stays group_size even when end was clamped to ne -- confirm this is intended */ | |
| tmp = 0.0f; | |
| for (int j = start; j < end; j += get_local_size(0)) { /* pass 2: center values and accumulate squared deviations */ | |
| float xi = src0[j] - mean; | |
| dst[j] = xi; /* centered value is parked in dst and rescaled in pass 3 */ | |
| tmp += xi * xi; | |
| } | |
| tmp = sub_group_reduce_add(tmp); | |
| const float variance = tmp / group_size; /* same group_size divisor as the mean */ | |
| const float scale = 1.0f/sqrt(variance + eps); | |
| for (int j = start; j < end; j += get_local_size(0)) { /* pass 3: same start/stride as pass 2, so each work-item rescales the elements it wrote */ | |
| dst[j] *= scale; | |
| } | |
| } | |