Commit 4e46f41 · committed by jeffbolznv · 1 parent: 9dcb047

vulkan: Use fp16 for the flash attention P*V multiplication (llama/12783)

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp CHANGED
@@ -330,9 +330,11 @@ void main() {
         // resize eM by using smear/reduce
         coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
-        O = eMdiag * O;
+        // multiply with fp16 accumulation, then add to O.
+        coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
+        PV = coopMatMulAdd(P_A, V, PV);
 
-        O = coopMatMulAdd(P_A, V, O);
+        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(PV);
     }
 
     // If there is split_k, then the split_k resolve shader does the final
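Note on the change: the flash attention running-output update rescales O by the row-wise correction factor eMdiag and adds the P·V contribution. Before this commit, the P_A * V cooperative-matrix multiply accumulated directly into O, which uses ACC_TYPE (presumably the shader's higher-precision accumulator type); after it, the product is accumulated in a separate float16 matrix and converted to ACC_TYPE only once, when it is folded into the rescaled O. A minimal restatement of the before/after pattern, using only identifiers that appear in the hunk above:

    // Before: P_A * V accumulated directly into the ACC_TYPE output matrix.
    O = eMdiag * O;
    O = coopMatMulAdd(P_A, V, O);

    // After: P_A * V accumulated in fp16, converted once, then added to the rescaled O.
    coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> PV =
        coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
    PV = coopMatMulAdd(P_A, V, PV);
    O  = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(PV);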