ggerganov committed
Commit f9c5a09 · unverified · 0 Parent(s)

Initial release

Files changed (11)
  1. .gitattributes +12 -0
  2. .gitignore +3 -0
  3. Makefile +109 -0
  4. convert-pt-to-ggml.py +328 -0
  5. dr_wav.h +0 -0
  6. ggml.c +0 -0
  7. ggml.h +527 -0
  8. main.cpp +2116 -0
  9. models/.gitignore +1 -0
  10. samples/.gitignore +1 -0
  11. samples/jfk.wav +3 -0
.gitattributes ADDED
@@ -0,0 +1,12 @@
1
+ bindings/go/samples/jfk.wav filter=lfs diff=lfs merge=lfs -text
2
+ models/for-tests-ggml-base.bin filter=lfs diff=lfs merge=lfs -text
3
+ models/for-tests-ggml-base.en.bin filter=lfs diff=lfs merge=lfs -text
4
+ models/for-tests-ggml-large.bin filter=lfs diff=lfs merge=lfs -text
5
+ models/for-tests-ggml-medium.bin filter=lfs diff=lfs merge=lfs -text
6
+ models/for-tests-ggml-medium.en.bin filter=lfs diff=lfs merge=lfs -text
7
+ models/for-tests-ggml-small.bin filter=lfs diff=lfs merge=lfs -text
8
+ models/for-tests-ggml-small.en.bin filter=lfs diff=lfs merge=lfs -text
9
+ models/for-tests-ggml-tiny.bin filter=lfs diff=lfs merge=lfs -text
10
+ models/for-tests-ggml-tiny.en.bin filter=lfs diff=lfs merge=lfs -text
11
+ models/for-tests-silero-v5.1.2-ggml.bin filter=lfs diff=lfs merge=lfs -text
12
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ sync.sh
2
+ main
3
+ *.o
Makefile ADDED
@@ -0,0 +1,109 @@
1
+ main: ggml.o main.o
2
+ g++ -o main ggml.o main.o
3
+
4
+ ggml.o: ggml.c ggml.h
5
+ gcc -O3 -mavx -mavx2 -mfma -mf16c -c ggml.c
6
+
7
+ main.o: main.cpp ggml.h
8
+ g++ -O3 -std=c++11 -c main.cpp
9
+
10
+ # clean up the directory
11
+ clean:
12
+ rm -f *.o main
13
+
14
+ # run the program
15
+ run: main
16
+ ./main
17
+
18
+ # download the following audio samples into folder "./samples":
19
+ .PHONY: samples
20
+ samples:
21
+ @echo "Downloading samples..."
22
+ mkdir -p samples
23
+ @wget --quiet --show-progress -O samples/gb0.ogg https://upload.wikimedia.org/wikipedia/commons/2/22/George_W._Bush%27s_weekly_radio_address_%28November_1%2C_2008%29.oga
24
+ @wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
25
+ @wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
26
+ @echo "Converting to 16-bit WAV ..."
27
+ @ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
28
+ @ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
29
+ @ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
30
+
31
+ .PHONY: tiny.en
32
+ tiny.en: main
33
+ @echo "Downloading tiny.en (75 MB just once)"
34
+ mkdir -p models
35
+ @if [ ! -f models/ggml-tiny.en.bin ]; then \
36
+ wget --quiet --show-progress -O models/ggml-tiny.en.bin https://ggml.ggerganov.com/ggml-model-whisper-tiny.en.bin ; \
37
+ fi
38
+ @echo "==============================================="
39
+ @echo "Running tiny.en on all samples in ./samples ..."
40
+ @echo "==============================================="
41
+ @echo ""
42
+ @for f in samples/*.wav; do \
43
+ echo "----------------------------------------------" ; \
44
+ echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
45
+ echo "----------------------------------------------" ; \
46
+ echo "" ; \
47
+ ./main -m models/ggml-tiny.en.bin -f $$f ; \
48
+ echo "" ; \
49
+ done
50
+
51
+ .PHONY: base.en
52
+ base.en: main
53
+ @echo "Downloading base.en (142 MB just once)"
54
+ mkdir -p models
55
+ @if [ ! -f models/ggml-base.en.bin ]; then \
56
+ wget --quiet --show-progress -O models/ggml-base.en.bin https://ggml.ggerganov.com/ggml-model-whisper-base.en.bin ; \
57
+ fi
58
+ @echo "==============================================="
59
+ @echo "Running base.en on all samples in ./samples ..."
60
+ @echo "==============================================="
61
+ @echo ""
62
+ @for f in samples/*.wav; do \
63
+ echo "----------------------------------------------" ; \
64
+ echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
65
+ echo "----------------------------------------------" ; \
66
+ echo "" ; \
67
+ ./main -m models/ggml-base.en.bin -f $$f ; \
68
+ echo "" ; \
69
+ done
70
+
71
+ .PHONY: small.en
72
+ small.en: main
73
+ @echo "Downloading small.en (466 MB just once)"
74
+ mkdir -p models
75
+ @if [ ! -f models/ggml-small.en.bin ]; then \
76
+ wget --quiet --show-progress -O models/ggml-small.en.bin https://ggml.ggerganov.com/ggml-model-whisper-small.en.bin ; \
77
+ fi
78
+ @echo "==============================================="
79
+ @echo "Running small.en on all samples in ./samples ..."
80
+ @echo "==============================================="
81
+ @echo ""
82
+ @for f in samples/*.wav; do \
83
+ echo "----------------------------------------------" ; \
84
+ echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
85
+ echo "----------------------------------------------" ; \
86
+ echo "" ; \
87
+ ./main -m models/ggml-small.en.bin -f $$f ; \
88
+ echo "" ; \
89
+ done
90
+
91
+ .PHONY: medium.en
92
+ medium.en: main
93
+ @echo "Downloading medium.en (1.5 GB just once)"
94
+ mkdir -p models
95
+ @if [ ! -f models/ggml-medium.en.bin ]; then \
96
+ wget --quiet --show-progress -O models/ggml-medium.en.bin https://ggml.ggerganov.com/ggml-model-whisper-medium.en.bin ; \
97
+ fi
98
+ @echo "==============================================="
99
+ @echo "Running medium.en on all samples in ./samples ..."
100
+ @echo "==============================================="
101
+ @echo ""
102
+ @for f in samples/*.wav; do \
103
+ echo "----------------------------------------------" ; \
104
+ echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
105
+ echo "----------------------------------------------" ; \
106
+ echo "" ; \
107
+ ./main -m models/ggml-medium.en.bin -f $$f ; \
108
+ echo "" ; \
109
+ done
convert-pt-to-ggml.py ADDED
@@ -0,0 +1,328 @@
1
+ # Convert Whisper transformer model from PyTorch to ggml format
2
+ #
3
+ # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
4
+ #
5
+ # You need to clone the original repo in ~/path/to/repo/whisper/
6
+ #
7
+ # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
8
+ #
9
+ # It is used to load various assets needed by the algorithm:
10
+ #
11
+ # - tokenizer
12
+ # - mel filters
13
+ #
14
+ # Also, you need to have the original models in ~/.cache/whisper/
15
+ # See the original repo for more details.
16
+ #
17
+ # This script loads the specified model and whisper assets and saves them in ggml format.
18
+ # The output is a single binary file containing the following information:
19
+ #
20
+ # - hparams
21
+ # - mel filters
22
+ # - tokenizer vocab
23
+ # - model variables
24
+ #
25
+ # For each variable, write the following:
26
+ #
27
+ # - Number of dimensions (int)
28
+ # - Name length (int)
29
+ # - Dimensions (int[n_dims])
30
+ # - Name (char[name_length])
31
+ # - Data (float[n_elements])
32
+ #
33
+
34
+ import io
35
+ import os
36
+ import sys
37
+ import struct
38
+ import json
39
+ import code
40
+ import torch
41
+ import numpy as np
42
+
43
+ from transformers import GPTJForCausalLM
44
+ from transformers import GPT2TokenizerFast
45
+
46
+ # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
47
+ LANGUAGES = {
48
+ "en": "english",
49
+ "zh": "chinese",
50
+ "de": "german",
51
+ "es": "spanish",
52
+ "ru": "russian",
53
+ "ko": "korean",
54
+ "fr": "french",
55
+ "ja": "japanese",
56
+ "pt": "portuguese",
57
+ "tr": "turkish",
58
+ "pl": "polish",
59
+ "ca": "catalan",
60
+ "nl": "dutch",
61
+ "ar": "arabic",
62
+ "sv": "swedish",
63
+ "it": "italian",
64
+ "id": "indonesian",
65
+ "hi": "hindi",
66
+ "fi": "finnish",
67
+ "vi": "vietnamese",
68
+ "iw": "hebrew",
69
+ "uk": "ukrainian",
70
+ "el": "greek",
71
+ "ms": "malay",
72
+ "cs": "czech",
73
+ "ro": "romanian",
74
+ "da": "danish",
75
+ "hu": "hungarian",
76
+ "ta": "tamil",
77
+ "no": "norwegian",
78
+ "th": "thai",
79
+ "ur": "urdu",
80
+ "hr": "croatian",
81
+ "bg": "bulgarian",
82
+ "lt": "lithuanian",
83
+ "la": "latin",
84
+ "mi": "maori",
85
+ "ml": "malayalam",
86
+ "cy": "welsh",
87
+ "sk": "slovak",
88
+ "te": "telugu",
89
+ "fa": "persian",
90
+ "lv": "latvian",
91
+ "bn": "bengali",
92
+ "sr": "serbian",
93
+ "az": "azerbaijani",
94
+ "sl": "slovenian",
95
+ "kn": "kannada",
96
+ "et": "estonian",
97
+ "mk": "macedonian",
98
+ "br": "breton",
99
+ "eu": "basque",
100
+ "is": "icelandic",
101
+ "hy": "armenian",
102
+ "ne": "nepali",
103
+ "mn": "mongolian",
104
+ "bs": "bosnian",
105
+ "kk": "kazakh",
106
+ "sq": "albanian",
107
+ "sw": "swahili",
108
+ "gl": "galician",
109
+ "mr": "marathi",
110
+ "pa": "punjabi",
111
+ "si": "sinhala",
112
+ "km": "khmer",
113
+ "sn": "shona",
114
+ "yo": "yoruba",
115
+ "so": "somali",
116
+ "af": "afrikaans",
117
+ "oc": "occitan",
118
+ "ka": "georgian",
119
+ "be": "belarusian",
120
+ "tg": "tajik",
121
+ "sd": "sindhi",
122
+ "gu": "gujarati",
123
+ "am": "amharic",
124
+ "yi": "yiddish",
125
+ "lo": "lao",
126
+ "uz": "uzbek",
127
+ "fo": "faroese",
128
+ "ht": "haitian creole",
129
+ "ps": "pashto",
130
+ "tk": "turkmen",
131
+ "nn": "nynorsk",
132
+ "mt": "maltese",
133
+ "sa": "sanskrit",
134
+ "lb": "luxembourgish",
135
+ "my": "myanmar",
136
+ "bo": "tibetan",
137
+ "tl": "tagalog",
138
+ "mg": "malagasy",
139
+ "as": "assamese",
140
+ "tt": "tatar",
141
+ "haw": "hawaiian",
142
+ "ln": "lingala",
143
+ "ha": "hausa",
144
+ "ba": "bashkir",
145
+ "jw": "javanese",
146
+ "su": "sundanese",
147
+ }
148
+
149
+ # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
150
+ def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
151
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
152
+ path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
153
+ tokenizer = GPT2TokenizerFast.from_pretrained(path)
154
+
155
+ specials = [
156
+ "<|startoftranscript|>",
157
+ *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
158
+ "<|translate|>",
159
+ "<|transcribe|>",
160
+ "<|startoflm|>",
161
+ "<|startofprev|>",
162
+ "<|nocaptions|>",
163
+ "<|notimestamps|>",
164
+ ]
165
+
166
+ tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
167
+ return tokenizer
168
+
169
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
170
+ def bytes_to_unicode():
171
+ """
172
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
173
+ The reversible bpe codes work on unicode strings.
174
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
175
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
176
+ This is a significant percentage of your normal, say, 32K bpe vocab.
177
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
178
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
179
+ """
180
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
181
+ cs = bs[:]
182
+ n = 0
183
+ for b in range(2**8):
184
+ if b not in bs:
185
+ bs.append(b)
186
+ cs.append(2**8+n)
187
+ n += 1
188
+ cs = [chr(n) for n in cs]
189
+ return dict(zip(bs, cs))
190
+
191
+
192
+ if len(sys.argv) < 4:
193
+ print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
194
+ sys.exit(1)
195
+
196
+ fname_inp = sys.argv[1]
197
+ dir_whisper = sys.argv[2]
198
+ dir_out = sys.argv[3]
199
+
200
+ # try to load PyTorch binary data
201
+ try:
202
+ model_bytes = open(fname_inp, "rb").read()
203
+ with io.BytesIO(model_bytes) as fp:
204
+ checkpoint = torch.load(fp, map_location="cpu")
205
+ except:
206
+ print("Error: failed to load PyTorch model file: %s" % fname_inp)
207
+ sys.exit(1)
208
+
209
+ hparams = checkpoint["dims"]
210
+ print("hparams:", hparams)
211
+
212
+ list_vars = checkpoint["model_state_dict"]
213
+
214
+ #print(list_vars['encoder.positional_embedding'])
215
+ #print(list_vars['encoder.conv1.weight'])
216
+ #print(list_vars['encoder.conv1.weight'].shape)
217
+
218
+ # load mel filters
219
+ n_mels = hparams["n_mels"]
220
+ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
221
+ filters = torch.from_numpy(f[f"mel_{n_mels}"])
222
+ #print (filters)
223
+
224
+ #code.interact(local=locals())
225
+
226
+ multilingual = hparams["n_vocab"] == 51865
227
+ tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
228
+
229
+ #print(tokenizer)
230
+ #print(tokenizer.name_or_path)
231
+ #print(len(tokenizer.additional_special_tokens))
232
+ dir_tokenizer = tokenizer.name_or_path
233
+
234
+ # output in the same directory as the model
235
+ fname_out = dir_out + "/ggml-model.bin"
236
+
237
+ with open(dir_tokenizer + "/vocab.json", "r") as f:
238
+ tokens = json.load(f)
239
+
240
+ # use 16-bit or 32-bit floats
241
+ use_f16 = True
242
+ if len(sys.argv) > 4:
243
+ use_f16 = False
244
+ fname_out = dir_out + "/ggml-model-f32.bin"
245
+
246
+ fout = open(fname_out, "wb")
247
+
248
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
249
+ fout.write(struct.pack("i", hparams["n_vocab"]))
250
+ fout.write(struct.pack("i", hparams["n_audio_ctx"]))
251
+ fout.write(struct.pack("i", hparams["n_audio_state"]))
252
+ fout.write(struct.pack("i", hparams["n_audio_head"]))
253
+ fout.write(struct.pack("i", hparams["n_audio_layer"]))
254
+ fout.write(struct.pack("i", hparams["n_text_ctx"]))
255
+ fout.write(struct.pack("i", hparams["n_text_state"]))
256
+ fout.write(struct.pack("i", hparams["n_text_head"]))
257
+ fout.write(struct.pack("i", hparams["n_text_layer"]))
258
+ fout.write(struct.pack("i", hparams["n_mels"]))
259
+ fout.write(struct.pack("i", use_f16))
260
+
261
+ # write mel filters
262
+ fout.write(struct.pack("i", filters.shape[0]))
263
+ fout.write(struct.pack("i", filters.shape[1]))
264
+ for i in range(filters.shape[0]):
265
+ for j in range(filters.shape[1]):
266
+ fout.write(struct.pack("f", filters[i][j]))
267
+
268
+ byte_encoder = bytes_to_unicode()
269
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
270
+
271
+ fout.write(struct.pack("i", len(tokens)))
272
+
273
+ for key in tokens:
274
+ text = bytearray([byte_decoder[c] for c in key]).decode('utf-8', errors='replace').encode('utf-8')
275
+ fout.write(struct.pack("i", len(text)))
276
+ fout.write(text)
277
+
278
+ for name in list_vars.keys():
279
+ data = list_vars[name].squeeze().numpy()
280
+ print("Processing variable: " + name + " with shape: ", data.shape)
281
+
282
+ # reshape conv bias from [n] to [n, 1]
283
+ if name == "encoder.conv1.bias" or \
284
+ name == "encoder.conv2.bias":
285
+ data = data.reshape(data.shape[0], 1)
286
+ print(" Reshaped variable: " + name + " to shape: ", data.shape)
287
+
288
+ n_dims = len(data.shape);
289
+
290
+ # looks like the whisper models are in f16 by default
291
+ # so we need to convert the small tensors to f32 until we fully support f16 in ggml
292
+ # ftype == 0 -> float32, ftype == 1 -> float16
293
+ ftype = 1;
294
+ if use_f16:
295
+ if n_dims < 2 or \
296
+ name == "encoder.conv1.bias" or \
297
+ name == "encoder.conv2.bias" or \
298
+ name == "encoder.positional_embedding" or \
299
+ name == "decoder.positional_embedding":
300
+ ftype = 0
301
+ data = data.astype(np.float32)
302
+ print(" Converting to float32")
303
+ data = data.astype(np.float32)
304
+ ftype = 0
305
+ else:
306
+ data = data.astype(np.float32)
307
+ ftype = 0
308
+
309
+ #if name.startswith("encoder"):
310
+ # if name.endswith("mlp.0.weight") or \
311
+ # name.endswith("mlp.2.weight"):
312
+ # print(" Transposing")
313
+ # data = data.transpose()
314
+
315
+ # header
316
+ str = name.encode('utf-8')
317
+ fout.write(struct.pack("iii", n_dims, len(str), ftype))
318
+ for i in range(n_dims):
319
+ fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
320
+ fout.write(str);
321
+
322
+ # data
323
+ data.tofile(fout)
324
+
325
+ fout.close()
326
+
327
+ print("Done. Output file: " + fname_out)
328
+ print("")
dr_wav.h ADDED
The diff for this file is too large to render. See raw diff
 
ggml.c ADDED
The diff for this file is too large to render. See raw diff
 
ggml.h ADDED
@@ -0,0 +1,527 @@
1
+ #pragma once
2
+
3
+ #ifdef __cplusplus
4
+ extern "C" {
5
+ #endif
6
+
7
+ #include <stdint.h>
8
+ #include <stddef.h>
9
+ #include <stdbool.h>
10
+
11
+ #define GGML_MAX_DIMS 4
12
+ #define GGML_MAX_NODES 4096
13
+ #define GGML_MAX_PARAMS 16
14
+ #define GGML_MAX_CONTEXTS 16
15
+
16
+ #ifdef __ARM_NEON
17
+ // we use the built-in 16-bit float type
18
+ typedef __fp16 ggml_fp16_t;
19
+ #else
20
+ typedef uint16_t ggml_fp16_t;
21
+ #endif
22
+
23
+ float ggml_fp16_to_fp32(ggml_fp16_t x);
24
+ ggml_fp16_t ggml_fp32_to_fp16(float x);
25
+
26
+ struct ggml_object;
27
+ struct ggml_context;
28
+
29
+ enum ggml_type {
30
+ GGML_TYPE_I8,
31
+ GGML_TYPE_I16,
32
+ GGML_TYPE_I32,
33
+ GGML_TYPE_F16,
34
+ GGML_TYPE_F32,
35
+ GGML_TYPE_COUNT,
36
+ };
37
+
38
+ enum ggml_op {
39
+ GGML_OP_NONE = 0,
40
+
41
+ GGML_OP_DUP,
42
+ GGML_OP_ADD,
43
+ GGML_OP_SUB,
44
+ GGML_OP_MUL,
45
+ GGML_OP_DIV,
46
+ GGML_OP_SQR,
47
+ GGML_OP_SQRT,
48
+ GGML_OP_SUM,
49
+ GGML_OP_MEAN,
50
+ GGML_OP_REPEAT,
51
+ GGML_OP_ABS,
52
+ GGML_OP_SGN,
53
+ GGML_OP_NEG,
54
+ GGML_OP_STEP,
55
+ GGML_OP_RELU,
56
+ GGML_OP_GELU,
57
+ GGML_OP_NORM, // normalize
58
+
59
+ GGML_OP_MUL_MAT,
60
+
61
+ GGML_OP_SCALE,
62
+ GGML_OP_CPY,
63
+ GGML_OP_RESHAPE,
64
+ GGML_OP_VIEW,
65
+ GGML_OP_PERMUTE,
66
+ GGML_OP_TRANSPOSE,
67
+ GGML_OP_GET_ROWS,
68
+ GGML_OP_DIAG_MASK_INF,
69
+ GGML_OP_SOFT_MAX,
70
+ GGML_OP_ROPE,
71
+ GGML_OP_CONV_1D_1S,
72
+ GGML_OP_CONV_1D_2S,
73
+
74
+ GGML_OP_COUNT,
75
+ };
76
+
77
+ // n-dimensional tensor
78
+ struct ggml_tensor {
79
+ enum ggml_type type;
80
+
81
+ int n_dims;
82
+ int ne[GGML_MAX_DIMS]; // number of elements
83
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
84
+ // nb[0] = sizeof(type)
85
+ // nb[1] = nb[0] * ne[0] + padding
86
+ // nb[i] = nb[i-1] * ne[i-1]
87
+
88
+ // compute data
89
+ enum ggml_op op;
90
+
91
+ bool is_param;
92
+
93
+ struct ggml_tensor * grad;
94
+ struct ggml_tensor * src0;
95
+ struct ggml_tensor * src1;
96
+
97
+ // thread scheduling
98
+ int n_tasks;
99
+
100
+ // performance
101
+ int perf_runs;
102
+ int64_t perf_cycles;
103
+ int64_t perf_time_us;
104
+
105
+ void * data;
106
+ char pad[8];
107
+ };
108
+
109
+ // computation graph
110
+ struct ggml_cgraph {
111
+ int n_nodes;
112
+ int n_leafs;
113
+ int n_threads;
114
+
115
+ size_t work_size;
116
+ struct ggml_tensor * work;
117
+
118
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
119
+ struct ggml_tensor * grads[GGML_MAX_NODES];
120
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
121
+
122
+ // performance
123
+ int perf_runs;
124
+ int64_t perf_cycles;
125
+ int64_t perf_time_us;
126
+ };
127
+
128
+ struct ggml_init_params {
129
+ // memory pool
130
+ size_t mem_size; // bytes
131
+ void * mem_buffer; // if NULL, memory will be allocated internally
132
+ };
133
+
134
+ int64_t ggml_time_ms(void);
135
+ int64_t ggml_time_us(void);
136
+ int64_t ggml_cycles(void);
137
+ int64_t ggml_cycles_per_ms(void);
138
+
139
+ void ggml_print_object (const struct ggml_object * obj);
140
+ void ggml_print_objects(const struct ggml_context * ctx);
141
+
142
+ int ggml_nelements(const struct ggml_tensor * tensor);
143
+ size_t ggml_nbytes (const struct ggml_tensor * tensor);
144
+
145
+ size_t ggml_type_size (enum ggml_type type);
146
+ size_t ggml_element_size(const struct ggml_tensor * tensor);
147
+
148
+ struct ggml_context * ggml_init(struct ggml_init_params params);
149
+ void ggml_free(struct ggml_context * ctx);
150
+
151
+ size_t ggml_used_mem(const struct ggml_context * ctx);
152
+
153
+ struct ggml_tensor * ggml_new_tensor(
154
+ struct ggml_context * ctx,
155
+ enum ggml_type type,
156
+ int n_dims,
157
+ const int *ne);
158
+
159
+ struct ggml_tensor * ggml_new_tensor_1d(
160
+ struct ggml_context * ctx,
161
+ enum ggml_type type,
162
+ int ne0);
163
+
164
+ struct ggml_tensor * ggml_new_tensor_2d(
165
+ struct ggml_context * ctx,
166
+ enum ggml_type type,
167
+ int ne0,
168
+ int ne1);
169
+
170
+ struct ggml_tensor * ggml_new_tensor_3d(
171
+ struct ggml_context * ctx,
172
+ enum ggml_type type,
173
+ int ne0,
174
+ int ne1,
175
+ int ne2);
176
+
177
+ struct ggml_tensor * ggml_new_tensor_4d(
178
+ struct ggml_context * ctx,
179
+ enum ggml_type type,
180
+ int ne0,
181
+ int ne1,
182
+ int ne2,
183
+ int ne3);
184
+
185
+ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
186
+
187
+ struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
188
+ struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
189
+
190
+ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
191
+ struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
192
+
193
+ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
194
+ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
195
+
196
+ void * ggml_get_data (const struct ggml_tensor * tensor);
197
+ float * ggml_get_data_f32(const struct ggml_tensor * tensor);
198
+
199
+ //
200
+ // operations on tensors with backpropagation
201
+ //
202
+
203
+ struct ggml_tensor * ggml_dup(
204
+ struct ggml_context * ctx,
205
+ struct ggml_tensor * a);
206
+
207
+ struct ggml_tensor * ggml_add(
208
+ struct ggml_context * ctx,
209
+ struct ggml_tensor * a,
210
+ struct ggml_tensor * b);
211
+
212
+ struct ggml_tensor * ggml_sub(
213
+ struct ggml_context * ctx,
214
+ struct ggml_tensor * a,
215
+ struct ggml_tensor * b);
216
+
217
+ struct ggml_tensor * ggml_mul(
218
+ struct ggml_context * ctx,
219
+ struct ggml_tensor * a,
220
+ struct ggml_tensor * b);
221
+
222
+ struct ggml_tensor * ggml_div(
223
+ struct ggml_context * ctx,
224
+ struct ggml_tensor * a,
225
+ struct ggml_tensor * b);
226
+
227
+ struct ggml_tensor * ggml_sqr(
228
+ struct ggml_context * ctx,
229
+ struct ggml_tensor * a);
230
+
231
+ struct ggml_tensor * ggml_sqrt(
232
+ struct ggml_context * ctx,
233
+ struct ggml_tensor * a);
234
+
235
+ // return scalar
236
+ // TODO: compute sum along rows
237
+ struct ggml_tensor * ggml_sum(
238
+ struct ggml_context * ctx,
239
+ struct ggml_tensor * a);
240
+
241
+ // mean along rows
242
+ struct ggml_tensor * ggml_mean(
243
+ struct ggml_context * ctx,
244
+ struct ggml_tensor * a);
245
+
246
+ // if a is the same shape as b, and a is not a parameter, return a
247
+ // otherwise, return a new tensor: repeat(a) to fit in b
248
+ struct ggml_tensor * ggml_repeat(
249
+ struct ggml_context * ctx,
250
+ struct ggml_tensor * a,
251
+ struct ggml_tensor * b);
252
+
253
+ struct ggml_tensor * ggml_abs(
254
+ struct ggml_context * ctx,
255
+ struct ggml_tensor * a);
256
+
257
+ struct ggml_tensor * ggml_sgn(
258
+ struct ggml_context * ctx,
259
+ struct ggml_tensor * a);
260
+
261
+ struct ggml_tensor * ggml_neg(
262
+ struct ggml_context * ctx,
263
+ struct ggml_tensor * a);
264
+
265
+ struct ggml_tensor * ggml_step(
266
+ struct ggml_context * ctx,
267
+ struct ggml_tensor * a);
268
+
269
+ struct ggml_tensor * ggml_relu(
270
+ struct ggml_context * ctx,
271
+ struct ggml_tensor * a);
272
+
273
+ // TODO: double-check this computation is correct
274
+ struct ggml_tensor * ggml_gelu(
275
+ struct ggml_context * ctx,
276
+ struct ggml_tensor * a);
277
+
278
+ // normalize along rows
279
+ // TODO: eps is hardcoded to 1e-5 for now
280
+ struct ggml_tensor * ggml_norm(
281
+ struct ggml_context * ctx,
282
+ struct ggml_tensor * a);
283
+
284
+ // A: m rows, n columns
285
+ // B: p rows, n columns (i.e. we transpose it internally)
286
+ // result is m columns, p rows
287
+ struct ggml_tensor * ggml_mul_mat(
288
+ struct ggml_context * ctx,
289
+ struct ggml_tensor * a,
290
+ struct ggml_tensor * b);
291
+
292
+ //
293
+ // operations on tensors without backpropagation
294
+ //
295
+
296
+ // in-place, returns view(a)
297
+ struct ggml_tensor * ggml_scale(
298
+ struct ggml_context * ctx,
299
+ struct ggml_tensor * a,
300
+ struct ggml_tensor * b);
301
+
302
+ // a -> b, return view(b)
303
+ struct ggml_tensor * ggml_cpy(
304
+ struct ggml_context * ctx,
305
+ struct ggml_tensor * a,
306
+ struct ggml_tensor * b);
307
+
308
+ // return view(a), b specifies the new shape
309
+ // TODO: when we start computing gradient, make a copy instead of view
310
+ struct ggml_tensor * ggml_reshape(
311
+ struct ggml_context * ctx,
312
+ struct ggml_tensor * a,
313
+ struct ggml_tensor * b);
314
+
315
+ // return view(a)
316
+ // TODO: when we start computing gradient, make a copy instead of view
317
+ struct ggml_tensor * ggml_reshape_2d(
318
+ struct ggml_context * ctx,
319
+ struct ggml_tensor * a,
320
+ int ne0,
321
+ int ne1);
322
+
323
+ // return view(a)
324
+ // TODO: when we start computing gradient, make a copy instead of view
325
+ struct ggml_tensor * ggml_reshape_3d(
326
+ struct ggml_context * ctx,
327
+ struct ggml_tensor * a,
328
+ int ne0,
329
+ int ne1,
330
+ int ne2);
331
+
332
+ // offset in bytes
333
+ struct ggml_tensor * ggml_view_1d(
334
+ struct ggml_context * ctx,
335
+ struct ggml_tensor * a,
336
+ int ne0,
337
+ size_t offset);
338
+
339
+ struct ggml_tensor * ggml_view_2d(
340
+ struct ggml_context * ctx,
341
+ struct ggml_tensor * a,
342
+ int ne0,
343
+ int ne1,
344
+ size_t nb1, // row stride in bytes
345
+ size_t offset);
346
+
347
+ struct ggml_tensor * ggml_permute(
348
+ struct ggml_context * ctx,
349
+ struct ggml_tensor * a,
350
+ int axis0,
351
+ int axis1,
352
+ int axis2,
353
+ int axis3);
354
+
355
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
356
+ struct ggml_tensor * ggml_transpose(
357
+ struct ggml_context * ctx,
358
+ struct ggml_tensor * a);
359
+
360
+ struct ggml_tensor * ggml_get_rows(
361
+ struct ggml_context * ctx,
362
+ struct ggml_tensor * a,
363
+ struct ggml_tensor * b);
364
+
365
+ // set elements above the diagonal to -INF
366
+ // in-place, returns view(a)
367
+ struct ggml_tensor * ggml_diag_mask_inf(
368
+ struct ggml_context * ctx,
369
+ struct ggml_tensor * a,
370
+ int n_past);
371
+
372
+ // in-place, returns view(a)
373
+ struct ggml_tensor * ggml_soft_max(
374
+ struct ggml_context * ctx,
375
+ struct ggml_tensor * a);
376
+
377
+ // rotary position embedding
378
+ // in-place, returns view(a)
379
+ // if mode == 1, skip n_past elements
380
+ // TODO: avoid creating a new tensor every time
381
+ struct ggml_tensor * ggml_rope(
382
+ struct ggml_context * ctx,
383
+ struct ggml_tensor * a,
384
+ int n_past,
385
+ int n_dims,
386
+ int mode);
387
+
388
+ // padding = 1
389
+ // TODO: we don't support extra parameters for now
390
+ // that's why we are hard-coding the stride, padding, and dilation
391
+ // not great ..
392
+ struct ggml_tensor * ggml_conv_1d_1s(
393
+ struct ggml_context * ctx,
394
+ struct ggml_tensor * a,
395
+ struct ggml_tensor * b);
396
+
397
+ struct ggml_tensor * ggml_conv_1d_2s(
398
+ struct ggml_context * ctx,
399
+ struct ggml_tensor * a,
400
+ struct ggml_tensor * b);
401
+
402
+ //
403
+ // automatic differentiation
404
+ //
405
+
406
+ void ggml_set_param(
407
+ struct ggml_context * ctx,
408
+ struct ggml_tensor * tensor);
409
+
410
+ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
411
+
412
+ struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
413
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
414
+
415
+ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
416
+ void ggml_graph_reset (struct ggml_cgraph * cgraph);
417
+
418
+ // print info and performance information for the graph
419
+ void ggml_graph_print(const struct ggml_cgraph * cgraph);
420
+
421
+ // dump the graph into a file using the dot format
422
+ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
423
+
424
+ //
425
+ // optimization
426
+ //
427
+
428
+ // optimization methods
429
+ enum ggml_opt_type {
430
+ GGML_OPT_ADAM,
431
+ GGML_OPT_LBFGS,
432
+ };
433
+
434
+ // linesearch methods
435
+ enum ggml_linesearch {
436
+ GGML_LINESEARCH_DEFAULT = 1,
437
+
438
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
439
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
440
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
441
+ };
442
+
443
+ // optimization return values
444
+ enum ggml_opt_result {
445
+ GGML_OPT_OK = 0,
446
+ GGML_OPT_DID_NOT_CONVERGE,
447
+ GGML_OPT_NO_CONTEXT,
448
+ GGML_OPT_INVALID_WOLFE,
449
+ GGML_OPT_FAIL,
450
+
451
+ GGML_LINESEARCH_FAIL = -128,
452
+ GGML_LINESEARCH_MINIMUM_STEP,
453
+ GGML_LINESEARCH_MAXIMUM_STEP,
454
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
455
+ GGML_LINESEARCH_INVALID_PARAMETERS,
456
+ };
457
+
458
+ // optimization parameters
459
+ //
460
+ // see ggml.c (ggml_opt_default_params) for default values
461
+ //
462
+ struct ggml_opt_params {
463
+ enum ggml_opt_type type;
464
+
465
+ int n_threads;
466
+
467
+ // delta-based convergence test
468
+ //
469
+ // if past == 0 - disabled
470
+ // if past > 0:
471
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
472
+ //
473
+ int past;
474
+ float delta;
475
+
476
+ // maximum number of iterations without improvement
477
+ //
478
+ // if 0 - disabled
479
+ // if > 0:
480
+ // assume convergence if no cost improvement in this number of iterations
481
+ //
482
+ int max_no_improvement;
483
+
484
+ bool print_forward_graph;
485
+ bool print_backward_graph;
486
+
487
+ union {
488
+ // ADAM parameters
489
+ struct {
490
+ int n_iter;
491
+
492
+ float alpha; // learning rate
493
+ float beta1;
494
+ float beta2;
495
+ float eps; // epsilon for numerical stability
496
+ float eps_f; // epsilon for convergence test
497
+ float eps_g; // epsilon for convergence test
498
+ } adam;
499
+
500
+ // LBFGS parameters
501
+ struct {
502
+ int m; // number of corrections to approximate the inv. Hessian
503
+ int n_iter;
504
+ int max_linesearch;
505
+
506
+ float eps; // convergence tolerance
507
+ float ftol; // line search tolerance
508
+ float wolfe;
509
+ float min_step;
510
+ float max_step;
511
+
512
+ enum ggml_linesearch linesearch;
513
+ } lbfgs;
514
+ };
515
+ };
516
+
517
+ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
518
+
519
+ // optimize the function defined by the tensor f
520
+ enum ggml_opt_result ggml_opt(
521
+ struct ggml_context * ctx,
522
+ struct ggml_opt_params params,
523
+ struct ggml_tensor * f);
524
+
525
+ #ifdef __cplusplus
526
+ }
527
+ #endif
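
The header above is the complete public API of this initial ggml release: a context backed by a single pre-allocated memory pool, tensor constructors, operations that lazily record a computation graph, and ggml_graph_compute to evaluate it. The snippet below is a minimal usage sketch, not part of this commit; it assumes the program is linked against ggml.c, and the pool size and tensor shapes are illustrative only.

```cpp
// Minimal sketch of using the ggml API declared above (assumed linked with ggml.c).
#include "ggml.h"

#include <cstdio>

int main() {
    // all tensors live in a single pre-allocated memory pool
    struct ggml_init_params params = {
        /* mem_size   */ 16*1024*1024,
        /* mem_buffer */ NULL, // let ggml allocate the pool internally
    };

    struct ggml_context * ctx = ggml_init(params);

    // A: 3 rows x 4 columns, B: 2 rows x 4 columns (shared inner dimension of 4)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    // C = A * B^T -- see the ggml_mul_mat comment in the header
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    // build the forward graph for c and evaluate it
    struct ggml_cgraph gf = ggml_build_forward(c);
    gf.n_threads = 1;

    ggml_graph_compute(ctx, &gf);

    const float * out = ggml_get_data_f32(c);
    for (int i = 0; i < ggml_nelements(c); ++i) {
        printf("c[%d] = %.1f\n", i, out[i]); // each element sums 4 products of 1*2 -> 8.0
    }

    ggml_free(ctx);

    return 0;
}
```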
main.cpp ADDED
@@ -0,0 +1,2116 @@
1
+ #include "ggml.h"
2
+
3
+ // third-party utilities
4
+ // use your favorite implementations
5
+ #define DR_WAV_IMPLEMENTATION
6
+ #include "dr_wav.h"
7
+
8
+ #include <algorithm>
9
+ #include <cassert>
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstring>
13
+ #include <fstream>
14
+ #include <map>
15
+ #include <string>
16
+ #include <thread>
17
+ #include <vector>
18
+
19
+ enum e_model {
20
+ MODEL_UNKNOWN,
21
+ MODEL_TINY,
22
+ MODEL_BASE,
23
+ MODEL_SMALL,
24
+ MODEL_MEDIUM,
25
+ MODEL_LARGE,
26
+ };
27
+
28
+ const size_t MB = 1024*1024;
29
+
30
+ const std::map<e_model, size_t> MEM_REQ_MODEL = {
31
+ { MODEL_TINY, 100ull*MB },
32
+ { MODEL_BASE, 190ull*MB },
33
+ { MODEL_SMALL, 610ull*MB },
34
+ { MODEL_MEDIUM, 1900ull*MB },
35
+ { MODEL_LARGE, 3600ull*MB },
36
+ };
37
+
38
+ const std::map<e_model, size_t> MEM_REQ_ENCODE = {
39
+ { MODEL_TINY, 80ull*MB },
40
+ { MODEL_BASE, 128ull*MB },
41
+ { MODEL_SMALL, 300ull*MB },
42
+ { MODEL_MEDIUM, 680ull*MB },
43
+ { MODEL_LARGE, 1100ull*MB },
44
+ };
45
+
46
+ const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
47
+ { MODEL_TINY, 170ull*MB },
48
+ { MODEL_BASE, 230ull*MB },
49
+ { MODEL_SMALL, 350ull*MB },
50
+ { MODEL_MEDIUM, 450ull*MB },
51
+ { MODEL_LARGE, 570ull*MB },
52
+ };
53
+
54
+ const std::map<e_model, size_t> MEM_REQ_DECODE = {
55
+ { MODEL_TINY, 190ull*MB },
56
+ { MODEL_BASE, 190ull*MB },
57
+ { MODEL_SMALL, 190ull*MB },
58
+ { MODEL_MEDIUM, 200ull*MB },
59
+ { MODEL_LARGE, 200ull*MB },
60
+ };
61
+
62
+ const std::map<e_model, size_t> MEM_REQ_DECODE_LAYER = {
63
+ { MODEL_TINY, 32ull*MB },
64
+ { MODEL_BASE, 44ull*MB },
65
+ { MODEL_SMALL, 64ull*MB },
66
+ { MODEL_MEDIUM, 84ull*MB },
67
+ { MODEL_LARGE, 110ull*MB },
68
+ };
69
+
70
+ const int SAMPLE_RATE = 16000;
71
+ const int N_FFT = 400;
72
+ const int N_MEL = 80;
73
+ const int HOP_LENGTH = 160;
74
+ const int CHUNK_SIZE = 30; // seconds
75
+
76
+ struct whisper_mel {
77
+ int n_len;
78
+ int n_mel;
79
+
80
+ std::vector<float> data;
81
+ };
82
+
83
+ struct whisper_filters {
84
+ int32_t n_mel;
85
+ int32_t n_fft;
86
+
87
+ std::vector<float> data;
88
+ };
89
+
90
+ struct whisper_vocab {
91
+ using id = int32_t;
92
+ using token = std::string;
93
+
94
+ int n_vocab = 51864;
95
+
96
+ std::map<token, id> token_to_id;
97
+ std::map<id, token> id_to_token;
98
+
99
+ id token_eot = 50256;
100
+ id token_sot = 50257;
101
+ id token_prev = 50360;
102
+ id token_solm = 50361; // ??
103
+ id token_beg = 50363;
104
+
105
+ bool is_multilingual() const {
106
+ return n_vocab == 51865;
107
+ }
108
+ };
109
+
110
+ // command-line parameters
111
+ struct whisper_params {
112
+ int32_t seed = -1; // RNG seed
113
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
114
+
115
+ int32_t max_tokens_per_iter = 64;
116
+
117
+ bool verbose = false;
118
+ bool print_special_tokens = false;
119
+
120
+ std::string model = "models/whisper-tiny.en/ggml-model.bin"; // model path
121
+
122
+ std::string fname_inp = "default.wav";
123
+ };
124
+
125
+ void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
126
+
127
+ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
128
+ for (int i = 1; i < argc; i++) {
129
+ std::string arg = argv[i];
130
+
131
+ if (arg == "-s" || arg == "--seed") {
132
+ params.seed = std::stoi(argv[++i]);
133
+ } else if (arg == "-t" || arg == "--threads") {
134
+ params.n_threads = std::stoi(argv[++i]);
135
+ } else if (arg == "-T" || arg == "--tokens") {
136
+ params.max_tokens_per_iter = std::stoi(argv[++i]);
137
+ } else if (arg == "-v" || arg == "--verbose") {
138
+ params.verbose = true;
139
+ } else if (arg == "-ps" || arg == "--print_special") {
140
+ params.print_special_tokens = true;
141
+ } else if (arg == "-m" || arg == "--model") {
142
+ params.model = argv[++i];
143
+ } else if (arg == "-f" || arg == "--file") {
144
+ params.fname_inp = argv[++i];
145
+ } else if (arg == "-h" || arg == "--help") {
146
+ whisper_print_usage(argc, argv, params);
147
+ exit(0);
148
+ } else {
149
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
150
+ whisper_print_usage(argc, argv, params);
151
+ exit(0);
152
+ }
153
+ }
154
+
155
+ return true;
156
+ }
157
+
158
+ void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
159
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
160
+ fprintf(stderr, "\n");
161
+ fprintf(stderr, "options:\n");
162
+ fprintf(stderr, " -h, --help show this help message and exit\n");
163
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
164
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
165
+ fprintf(stderr, " -T N, --tokens N maximum number of tokens to generate per iteration (default: %d)\n", params.max_tokens_per_iter);
166
+ fprintf(stderr, " -v, --verbose verbose output\n");
167
+ fprintf(stderr, " -ps, --print_special print special tokens\n");
168
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
169
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
170
+ fprintf(stderr, " -f FNAME, --file FNAME\n");
171
+ fprintf(stderr, " input WAV file path (default: %s)\n", params.fname_inp.c_str());
172
+ fprintf(stderr, "\n");
173
+ }
174
+
175
+
176
+ // medium
177
+ // hparams: {
178
+ // 'n_mels': 80,
179
+ // 'n_vocab': 51864,
180
+ // 'n_audio_ctx': 1500,
181
+ // 'n_audio_state': 1024,
182
+ // 'n_audio_head': 16,
183
+ // 'n_audio_layer': 24,
184
+ // 'n_text_ctx': 448,
185
+ // 'n_text_state': 1024,
186
+ // 'n_text_head': 16,
187
+ // 'n_text_layer': 24
188
+ // }
189
+ //
190
+ // default hparams (Whisper tiny)
191
+ struct whisper_hparams {
192
+ int32_t n_vocab = 51864;
193
+ int32_t n_audio_ctx = 1500;
194
+ int32_t n_audio_state = 384;
195
+ int32_t n_audio_head = 6;
196
+ int32_t n_audio_layer = 4;
197
+ int32_t n_text_ctx = 448;
198
+ int32_t n_text_state = 384;
199
+ int32_t n_text_head = 6;
200
+ int32_t n_text_layer = 4;
201
+ int32_t n_mels = 80;
202
+ int32_t f16 = 1;
203
+ };
204
+
205
+ // audio encoding layer
206
+ struct whisper_layer_encoder {
207
+ // encoder.blocks.*.attn_ln
208
+ struct ggml_tensor * attn_ln_0_w;
209
+ struct ggml_tensor * attn_ln_0_b;
210
+
211
+ // encoder.blocks.*.attn.out
212
+ struct ggml_tensor * attn_ln_1_w;
213
+ struct ggml_tensor * attn_ln_1_b;
214
+
215
+ // encoder.blocks.*.attn.query
216
+ struct ggml_tensor * attn_q_w;
217
+ struct ggml_tensor * attn_q_b;
218
+
219
+ // encoder.blocks.*.attn.key
220
+ struct ggml_tensor * attn_k_w;
221
+
222
+ // encoder.blocks.*.attn.value
223
+ struct ggml_tensor * attn_v_w;
224
+ struct ggml_tensor * attn_v_b;
225
+
226
+ // encoder.blocks.*.mlp_ln
227
+ struct ggml_tensor * mlp_ln_w;
228
+ struct ggml_tensor * mlp_ln_b;
229
+
230
+ // encoder.blocks.*.mlp.0
231
+ struct ggml_tensor * mlp_0_w;
232
+ struct ggml_tensor * mlp_0_b;
233
+
234
+ // encoder.blocks.*.mlp.2
235
+ struct ggml_tensor * mlp_1_w;
236
+ struct ggml_tensor * mlp_1_b;
237
+ };
238
+
239
+ // token decoding layer
240
+ struct whisper_layer_decoder {
241
+ // decoder.blocks.*.attn_ln
242
+ struct ggml_tensor * attn_ln_0_w;
243
+ struct ggml_tensor * attn_ln_0_b;
244
+
245
+ // decoder.blocks.*.attn.out
246
+ struct ggml_tensor * attn_ln_1_w;
247
+ struct ggml_tensor * attn_ln_1_b;
248
+
249
+ // decoder.blocks.*.attn.query
250
+ struct ggml_tensor * attn_q_w;
251
+ struct ggml_tensor * attn_q_b;
252
+
253
+ // decoder.blocks.*.attn.key
254
+ struct ggml_tensor * attn_k_w;
255
+
256
+ // decoder.blocks.*.attn.value
257
+ struct ggml_tensor * attn_v_w;
258
+ struct ggml_tensor * attn_v_b;
259
+
260
+ // decoder.blocks.*.cross_attn_ln
261
+ struct ggml_tensor * cross_attn_ln_0_w;
262
+ struct ggml_tensor * cross_attn_ln_0_b;
263
+
264
+ // decoder.blocks.*.cross_attn.out
265
+ struct ggml_tensor * cross_attn_ln_1_w;
266
+ struct ggml_tensor * cross_attn_ln_1_b;
267
+
268
+ // decoder.blocks.*.cross_attn.query
269
+ struct ggml_tensor * cross_attn_q_w;
270
+ struct ggml_tensor * cross_attn_q_b;
271
+
272
+ // decoder.blocks.*.cross_attn.key
273
+ struct ggml_tensor * cross_attn_k_w;
274
+
275
+ // decoder.blocks.*.cross_attn.value
276
+ struct ggml_tensor * cross_attn_v_w;
277
+ struct ggml_tensor * cross_attn_v_b;
278
+
279
+ // decoder.blocks.*.mlp_ln
280
+ struct ggml_tensor * mlp_ln_w;
281
+ struct ggml_tensor * mlp_ln_b;
282
+
283
+ // decoder.blocks.*.mlp.0
284
+ struct ggml_tensor * mlp_0_w;
285
+ struct ggml_tensor * mlp_0_b;
286
+
287
+ // decoder.blocks.*.mlp.2
288
+ struct ggml_tensor * mlp_1_w;
289
+ struct ggml_tensor * mlp_1_b;
290
+ };
291
+
292
+ struct whisper_model {
293
+ e_model type = MODEL_UNKNOWN;
294
+
295
+ whisper_hparams hparams;
296
+ whisper_filters filters;
297
+
298
+ // encoder.positional_embedding
299
+ struct ggml_tensor * e_pe;
300
+
301
+ // encoder.conv1
302
+ struct ggml_tensor * e_conv_1_w;
303
+ struct ggml_tensor * e_conv_1_b;
304
+
305
+ // encoder.conv2
306
+ struct ggml_tensor * e_conv_2_w;
307
+ struct ggml_tensor * e_conv_2_b;
308
+
309
+ // encoder.ln_post
310
+ struct ggml_tensor * e_ln_w;
311
+ struct ggml_tensor * e_ln_b;
312
+
313
+ // decoder.positional_embedding
314
+ struct ggml_tensor * d_pe; // DD
315
+
316
+ // decoder.token_embedding
317
+ struct ggml_tensor * d_te; // DD
318
+
319
+ // decoder.ln
320
+ struct ggml_tensor * d_ln_w; // DD
321
+ struct ggml_tensor * d_ln_b; // DD
322
+
323
+ std::vector<whisper_layer_encoder> layers_encoder;
324
+ std::vector<whisper_layer_decoder> layers_decoder;
325
+
326
+ // key + value memory
327
+ struct ggml_tensor * memory_k;
328
+ struct ggml_tensor * memory_v;
329
+
330
+ struct ggml_tensor * memory_cross_k;
331
+ struct ggml_tensor * memory_cross_v;
332
+
333
+ //
334
+ struct ggml_context * ctx;
335
+ std::map<std::string, struct ggml_tensor *> tensors;
336
+ };
337
+
338
+ // load the model from a ggml file
339
+ //
340
+ // file format:
341
+ //
342
+ // - hparams
343
+ // - pre-computed mel filters
344
+ // - vocab
345
+ // - weights
346
+ //
347
+ // see the convert-pt-to-ggml.py script for details
348
+ //
349
+ bool whisper_model_load(const std::string & fname, whisper_model & model, whisper_vocab & vocab) {
350
+ printf("%s: loading model from '%s'\n", __func__, fname.c_str());
351
+
352
+ auto fin = std::ifstream(fname, std::ios::binary);
353
+ if (!fin) {
354
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
355
+ return false;
356
+ }
357
+
358
+ // verify magic
359
+ {
360
+ uint32_t magic;
361
+ fin.read((char *) &magic, sizeof(magic));
362
+ if (magic != 0x67676d6c) {
363
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
364
+ return false;
365
+ }
366
+ }
367
+
368
+ // load hparams
369
+ {
370
+ auto & hparams = model.hparams;
371
+
372
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
373
+ fin.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
374
+ fin.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
375
+ fin.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
376
+ fin.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
377
+ fin.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
378
+ fin.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
379
+ fin.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
380
+ fin.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
381
+ fin.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
382
+ fin.read((char *) &hparams.f16, sizeof(hparams.f16));
383
+
384
+ assert(hparams.n_text_state == hparams.n_audio_state);
385
+
386
+ if (hparams.n_audio_layer == 4) {
387
+ model.type = e_model::MODEL_TINY;
388
+ }
389
+
390
+ if (hparams.n_audio_layer == 6) {
391
+ model.type = e_model::MODEL_BASE;
392
+ }
393
+
394
+ if (hparams.n_audio_layer == 12) {
395
+ model.type = e_model::MODEL_SMALL;
396
+ }
397
+
398
+ if (hparams.n_audio_layer == 24) {
399
+ model.type = e_model::MODEL_MEDIUM;
400
+ }
401
+
402
+ if (hparams.n_audio_layer == 32) {
403
+ model.type = e_model::MODEL_LARGE;
404
+ }
405
+
406
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
407
+ printf("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
408
+ printf("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
409
+ printf("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
410
+ printf("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
411
+ printf("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
412
+ printf("%s: n_text_state = %d\n", __func__, hparams.n_text_state);
413
+ printf("%s: n_text_head = %d\n", __func__, hparams.n_text_head);
414
+ printf("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
415
+ printf("%s: n_mels = %d\n", __func__, hparams.n_mels);
416
+ printf("%s: f16 = %d\n", __func__, hparams.f16);
417
+ printf("%s: type = %d\n", __func__, model.type);
418
+
419
+ const size_t mem_required =
420
+ MEM_REQ_MODEL.at(model.type) +
421
+ MEM_REQ_ENCODE.at(model.type) +
422
+ MEM_REQ_ENCODE_LAYER.at(model.type) +
423
+ MEM_REQ_DECODE.at(model.type) +
424
+ MEM_REQ_DECODE_LAYER.at(model.type);
425
+
426
+ printf("%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
427
+ }
428
+
429
+ // load mel filters
430
+ {
431
+ auto & filters = model.filters;
432
+
433
+ fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
434
+ fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
435
+
436
+ filters.data.resize(filters.n_mel * filters.n_fft);
437
+ fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
438
+ }
439
+
440
+ // load vocab
441
+ {
442
+ int32_t n_vocab = 0;
443
+ fin.read((char *) &n_vocab, sizeof(n_vocab));
444
+
445
+ //if (n_vocab != model.hparams.n_vocab) {
446
+ // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
447
+ // __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
448
+ // return false;
449
+ //}
450
+
451
+ std::string word;
452
+ for (int i = 0; i < n_vocab; i++) {
453
+ uint32_t len;
454
+ fin.read((char *) &len, sizeof(len));
455
+
456
+ word.resize(len);
457
+ fin.read((char *) word.data(), len);
458
+
459
+ vocab.token_to_id[word] = i;
460
+ vocab.id_to_token[i] = word;
461
+
462
+ //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
463
+ }
464
+
465
+ vocab.n_vocab = model.hparams.n_vocab;
466
+ if (vocab.is_multilingual()) {
467
+ vocab.token_eot++;
468
+ vocab.token_sot++;
469
+ vocab.token_prev++;
470
+ vocab.token_solm++;
471
+ vocab.token_beg++;
472
+ }
473
+
474
+ if (n_vocab < model.hparams.n_vocab) {
475
+ printf("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
476
+ for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
477
+ if (i > vocab.token_beg) {
478
+ word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
479
+ } else if (i == vocab.token_eot) {
480
+ word = "[_EOT_]";
481
+ } else if (i == vocab.token_sot) {
482
+ word = "[_SOT_]";
483
+ } else if (i == vocab.token_prev) {
484
+ word = "[_PREV_]";
485
+ } else if (i == vocab.token_beg) {
486
+ word = "[_BEG_]";
487
+ } else {
488
+ word = "[_extra_token_" + std::to_string(i) + "]";
489
+ }
490
+ vocab.token_to_id[word] = i;
491
+ vocab.id_to_token[i] = word;
492
+ }
493
+ }
494
+ }
495
+
496
+ // for the big tensors, we have the option to store the data in 16-bit floats
497
+ // in order to save memory and also to speed up the computation
498
+ const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
499
+
500
+ auto & ctx = model.ctx;
501
+
502
+ size_t ctx_size = 0;
503
+
504
+ {
505
+ const auto & hparams = model.hparams;
506
+
507
+ const int n_vocab = hparams.n_vocab;
508
+
509
+ const int n_audio_ctx = hparams.n_audio_ctx;
510
+ const int n_audio_state = hparams.n_audio_state;
511
+ const int n_audio_layer = hparams.n_audio_layer;
512
+
513
+ const int n_text_ctx = hparams.n_text_ctx;
514
+ const int n_text_state = hparams.n_text_state;
515
+ const int n_text_layer = hparams.n_text_layer;
516
+
517
+ const int n_mels = hparams.n_mels;
518
+
519
+ // encoder
520
+ {
521
+ // TODO: F16 .. maybe not?
522
+ ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
523
+
524
+ ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w
525
+ ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b
526
+
527
+ ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w
528
+ ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b
529
+
530
+ ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w;
531
+ ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b;
532
+ }
533
+
534
+ // decoder
535
+ {
536
+ // TODO: F16 .. maybe not?
537
+ ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
538
+
539
+ ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
540
+
541
+ ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w;
542
+ ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b;
543
+ }
544
+
545
+ // encoder layers
546
+ {
547
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
548
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
549
+
550
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w
551
+ ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
552
+
553
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
554
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
555
+
556
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
557
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
558
+
559
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
560
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
561
+
562
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
563
+
564
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
565
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
566
+
567
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
568
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
569
+ }
570
+
571
+ // decoder layers
572
+ {
573
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
574
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
575
+
576
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
577
+ ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
578
+
579
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
580
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
581
+
582
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
583
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
584
+
585
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
586
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
587
+
588
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
589
+
590
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
591
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
592
+
593
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
594
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
595
+ //
596
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
597
+ ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
598
+
599
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
600
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
601
+
602
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
603
+
604
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
605
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
606
+
607
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
608
+ ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
609
+ }
610
+
611
+ ctx_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // memory_k
612
+ ctx_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // memory_v
613
+
614
+ ctx_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // memory_cross_k
615
+ ctx_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // memory_cross_v
616
+
617
+ ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
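+ // rough bookkeeping: 15 non-layer tensors (7 encoder, 4 decoder, 4 KV memory), plus 15 tensors
+ // per encoder layer and 24 per decoder layer, each charged ~256 bytes of ggml object overhead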
618
+
619
+ printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
620
+ }
621
+
622
+ // create the ggml context
623
+ {
624
+ struct ggml_init_params params = {
625
+ .mem_size = ctx_size,
626
+ .mem_buffer = NULL,
627
+ };
628
+
629
+ model.ctx = ggml_init(params);
630
+ if (!model.ctx) {
631
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
632
+ return false;
633
+ }
634
+ }
635
+
636
+ // prepare memory for the weights
637
+ {
638
+ const auto & hparams = model.hparams;
639
+
640
+ const int n_vocab = hparams.n_vocab;
641
+
642
+ const int n_audio_ctx = hparams.n_audio_ctx;
643
+ const int n_audio_state = hparams.n_audio_state;
644
+ const int n_audio_layer = hparams.n_audio_layer;
645
+
646
+ const int n_text_ctx = hparams.n_text_ctx;
647
+ const int n_text_state = hparams.n_text_state;
648
+ const int n_text_layer = hparams.n_text_layer;
649
+
650
+ const int n_mels = hparams.n_mels;
651
+
652
+ model.layers_encoder.resize(n_audio_layer);
653
+ model.layers_decoder.resize(n_text_layer);
654
+
655
+ // encoder
656
+ {
657
+ model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
658
+
659
+ model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
660
+ model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
661
+
662
+ model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
663
+ model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
664
+
665
+ model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
666
+ model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
667
+
668
+ // map by name
669
+ model.tensors["encoder.positional_embedding"] = model.e_pe;
670
+
671
+ model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
672
+ model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
673
+
674
+ model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
675
+ model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
676
+
677
+ model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
678
+ model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
679
+
680
+ for (int i = 0; i < n_audio_layer; ++i) {
681
+ auto & layer = model.layers_encoder[i];
682
+
683
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
684
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
685
+
686
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
687
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
688
+
689
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
690
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
691
+
692
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
693
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
694
+
695
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
696
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
697
+
698
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
699
+
700
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
701
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
702
+
703
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
704
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
705
+
706
+ // map by name
707
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
708
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
709
+
710
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
711
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
712
+
713
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
714
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
715
+
716
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
717
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
718
+
719
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
720
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
721
+
722
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
723
+
724
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
725
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
726
+
727
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
728
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
729
+ }
730
+ }
731
+
732
+ // decoder
733
+ {
734
+ model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
735
+
736
+ model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
737
+
738
+ model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
739
+ model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
740
+
741
+ // map by name
742
+ model.tensors["decoder.positional_embedding"] = model.d_pe;
743
+
744
+ model.tensors["decoder.token_embedding.weight"] = model.d_te;
745
+
746
+ model.tensors["decoder.ln.weight"] = model.d_ln_w;
747
+ model.tensors["decoder.ln.bias"] = model.d_ln_b;
748
+
749
+ for (int i = 0; i < n_text_layer; ++i) {
750
+ auto & layer = model.layers_decoder[i];
751
+
752
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
753
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
754
+
755
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
756
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
757
+
758
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
759
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
760
+
761
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
762
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
763
+
764
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
765
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
766
+
767
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
768
+
769
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
770
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
771
+
772
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
773
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
774
+
775
+ layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
776
+ layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
777
+
778
+ layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
779
+ layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
780
+
781
+ layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
782
+
783
+ layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
784
+ layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
785
+
786
+ layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
787
+ layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
788
+
789
+ // map by name
790
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
791
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
792
+
793
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
794
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
795
+
796
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
797
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
798
+
799
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
800
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
801
+
802
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
803
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
804
+
805
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
806
+
807
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
808
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
809
+
810
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
811
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
812
+
813
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
814
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
815
+
816
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
817
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
818
+
819
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
820
+
821
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
822
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
823
+
824
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
825
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
826
+ }
827
+ }
828
+ }
829
+
830
+ // key + value memory
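+ // memory_k/memory_v act as the decoder's self-attention KV cache (one slot per layer and text
+ // position), while memory_cross_k/memory_cross_v hold the cross-attention keys/values that are
+ // pre-computed once per encoder pass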
831
+ {
832
+ const auto & hparams = model.hparams;
833
+
834
+ const int n_text_state = hparams.n_text_state;
835
+ const int n_text_layer = hparams.n_text_layer;
836
+ const int n_text_ctx = hparams.n_text_ctx;
837
+
838
+ {
839
+ const int n_mem = n_text_layer*n_text_ctx;
840
+ const int n_elements = n_text_state*n_mem;
841
+
842
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
843
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
844
+ }
845
+
846
+ {
847
+ const int n_audio_ctx = hparams.n_audio_ctx;
848
+
849
+ const int n_mem = n_text_layer*n_audio_ctx;
850
+ const int n_elements = n_text_state*n_mem;
851
+
852
+ model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
853
+ model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
854
+ }
855
+
856
+ const size_t memory_size =
857
+ ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
858
+ ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
859
+
860
+ printf("%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0);
861
+ }
862
+
863
+ // load weights
864
+ {
865
+ size_t total_size = 0;
866
+
867
+ while (true) {
868
+ int32_t n_dims;
869
+ int32_t length;
870
+ int32_t ftype;
871
+
872
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
873
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
874
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
875
+
876
+ if (fin.eof()) {
877
+ break;
878
+ }
879
+
880
+ int32_t nelements = 1;
881
+ int32_t ne[3] = { 1, 1, 1 };
882
+ for (int i = 0; i < n_dims; ++i) {
883
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
884
+ nelements *= ne[i];
885
+ }
886
+
887
+ std::string name(length, 0);
888
+ fin.read(&name[0], length);
889
+
890
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
891
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
892
+ return false;
893
+ }
894
+
895
+ auto tensor = model.tensors[name.data()];
896
+ if (ggml_nelements(tensor) != nelements) {
897
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
898
+ return false;
899
+ }
900
+
901
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
902
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
903
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
904
+ return false;
905
+ }
906
+
907
+ const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
908
+
909
+ if (nelements*bpe != ggml_nbytes(tensor)) {
910
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
911
+ __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
912
+ return false;
913
+ }
914
+
915
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
916
+
917
+ //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
918
+ total_size += ggml_nbytes(tensor);
919
+ }
920
+
921
+ printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
922
+ }
923
+
924
+ fin.close();
925
+
926
+ return true;
927
+ }
928
+
929
+ // evaluate the encoder
930
+ //
931
+ // given an audio recording (more specifically, its log mel spectrogram), runs the forward pass of the encoder
932
+ // part of the transformer model and returns the encoded features
933
+ //
934
+ // - model: the model
935
+ // - n_threads: number of threads to use
936
+ // - mel_offset: offset in the mel spectrogram (i.e. audio offset)
937
+ // - mel_inp: input mel spectrogram
938
+ // - features: output encoded features
939
+ //
940
+ bool whisper_encode(
941
+ const whisper_model & model,
942
+ const int n_threads,
943
+ const int mel_offset,
944
+ const whisper_mel & mel_inp,
945
+ std::vector<float> & features) {
946
+ const auto & hparams = model.hparams;
947
+
948
+ const int n_vocab = hparams.n_vocab;
949
+
950
+ const int n_ctx = hparams.n_audio_ctx;
951
+ const int n_state = hparams.n_audio_state;
952
+ const int n_head = hparams.n_audio_head;
953
+ const int n_layer = hparams.n_audio_layer;
954
+
955
+ const int N = n_ctx;
956
+
957
+ const int n_mels = hparams.n_mels;
958
+ assert(mel_inp.n_mel == n_mels);
959
+
960
+ struct ggml_init_params params;
961
+
962
+ {
963
+ static size_t buf_size = MEM_REQ_ENCODE.at(model.type);
964
+ static void * buf = malloc(buf_size);
965
+
966
+ params = {
967
+ .mem_size = buf_size,
968
+ .mem_buffer = buf,
969
+ };
970
+ }
971
+
972
+ struct ggml_context * ctx0 = ggml_init(params);
973
+
974
+ struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
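+ // the second conv layer has stride 2, so the n_audio_ctx encoder positions consume 2*n_audio_ctx
+ // mel frames (a full 30-second chunk at 10 ms per frame for the standard Whisper hparams)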
975
+ assert(mel->type == GGML_TYPE_F32);
976
+ {
977
+ float * dst = (float *) mel->data;
978
+ memset(dst, 0, ggml_nbytes(mel));
979
+
980
+ const int i0 = std::min(mel_offset, mel_inp.n_len);
981
+ const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
982
+
983
+ for (int j = 0; j < mel_inp.n_mel; ++j) {
984
+ for (int i = i0; i < i1; ++i) {
985
+ dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
986
+ }
987
+ }
988
+ }
989
+
990
+ struct ggml_tensor * cur;
991
+
992
+ // convolution + gelu
993
+ {
994
+ cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
995
+ cur = ggml_add(ctx0,
996
+ ggml_repeat(ctx0,
997
+ model.e_conv_1_b,
998
+ cur),
999
+ cur);
1000
+
1001
+ cur = ggml_gelu(ctx0, cur);
1002
+
1003
+ cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
1004
+ cur = ggml_add(ctx0,
1005
+ ggml_repeat(ctx0,
1006
+ model.e_conv_2_b,
1007
+ cur),
1008
+ cur);
1009
+
1010
+ cur = ggml_gelu(ctx0, cur);
1011
+ }
1012
+
1013
+ cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
1014
+
1015
+ struct ggml_tensor * inpL = cur;
1016
+
1017
+ for (int il = 0; il < n_layer; ++il) {
1018
+ const auto & layer = model.layers_encoder[il];
1019
+
1020
+ // create separate context for each layer to reduce memory usage
1021
+
1022
+ struct ggml_init_params paramsL;
1023
+ {
1024
+ static size_t buf_size = MEM_REQ_ENCODE_LAYER.at(model.type);
1025
+ static void * buf = malloc(buf_size);
1026
+
1027
+ paramsL = {
1028
+ .mem_size = buf_size,
1029
+ .mem_buffer = buf,
1030
+ };
1031
+ }
1032
+
1033
+ struct ggml_context * ctxL = ggml_init(paramsL);
1034
+
1035
+ // norm
1036
+ {
1037
+ cur = ggml_norm(ctxL, inpL);
1038
+
1039
+ // cur = ln_0_w*cur + ln_0_b
1040
+ cur = ggml_add(ctxL,
1041
+ ggml_mul(ctxL,
1042
+ ggml_repeat(ctxL, layer.attn_ln_0_w, cur),
1043
+ cur),
1044
+ ggml_repeat(ctxL, layer.attn_ln_0_b, cur));
1045
+ }
1046
+
1047
+ // self-attention
1048
+ {
1049
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
1050
+ layer.attn_q_w,
1051
+ cur);
1052
+
1053
+ Qcur = ggml_add(ctxL,
1054
+ ggml_repeat(ctxL,
1055
+ layer.attn_q_b,
1056
+ Qcur),
1057
+ Qcur);
1058
+
1059
+ Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
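+ // Q and K are each scaled by (n_state/n_head)^-0.25 so that their product carries the usual
+ // 1/sqrt(d_head) attention scaling - hence the separate KQ_scaled step below stays commented out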
1060
+
1061
+ // no bias for Key
1062
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctxL,
1063
+ layer.attn_k_w,
1064
+ cur);
1065
+
1066
+ Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1067
+
1068
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctxL,
1069
+ layer.attn_v_w,
1070
+ cur);
1071
+
1072
+ Vcur = ggml_add(ctxL,
1073
+ ggml_repeat(ctxL,
1074
+ layer.attn_v_b,
1075
+ Vcur),
1076
+ Vcur);
1077
+
1078
+ // ------
1079
+
1080
+ struct ggml_tensor * Q =
1081
+ ggml_permute(ctxL,
1082
+ ggml_cpy(ctxL,
1083
+ Qcur,
1084
+ ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
1085
+ 0, 2, 1, 3);
1086
+
1087
+ struct ggml_tensor * K =
1088
+ ggml_permute(ctxL,
1089
+ ggml_cpy(ctxL,
1090
+ Kcur,
1091
+ ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)), // F16 !
1092
+ 0, 2, 1, 3);
1093
+
1094
+ //// BLAS attempt
1095
+ //struct ggml_tensor * KQ =
1096
+ // ggml_mul_mat(ctxL,
1097
+ // ggml_cpy(ctxL, K, ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, N, n_head)),
1098
+ // ggml_cpy(ctxL, Q, ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, N, n_head)));
1099
+
1100
+ // K * Q
1101
+ struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
1102
+
1103
+ //struct ggml_tensor * K =
1104
+ // ggml_cpy(ctxL,
1105
+ // ggml_permute(ctxL,
1106
+ // ggml_reshape_3d(ctxL,
1107
+ // Kcur,
1108
+ // n_state/n_head, n_head, N),
1109
+ // 1, 2, 0, 3),
1110
+ // ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, N, n_state/n_head, n_head)
1111
+ // );
1112
+
1113
+ //// K * Q
1114
+ //struct ggml_tensor * KQ = ggml_mul_mat(ctxL, ggml_transpose(ctxL, K), Q);
1115
+
1116
+ //struct ggml_tensor * KQ_scaled =
1117
+ // ggml_scale(ctxL,
1118
+ // KQ,
1119
+ // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1120
+ // );
1121
+
1122
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ);
1123
+
1124
+ //struct ggml_tensor * V_trans =
1125
+ // ggml_permute(ctxL,
1126
+ // ggml_cpy(ctxL,
1127
+ // Vcur,
1128
+ // ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
1129
+ // 1, 2, 0, 3);
1130
+
1131
+ //struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1132
+
1133
+ struct ggml_tensor * V =
1134
+ ggml_cpy(ctxL,
1135
+ ggml_permute(ctxL,
1136
+ ggml_reshape_3d(ctxL,
1137
+ Vcur,
1138
+ n_state/n_head, n_head, N),
1139
+ 0, 2, 1, 3),
1140
+ ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, N, n_head) // F16 !
1141
+ );
1142
+
1143
+ struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
1144
+
1145
+ struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
1146
+
1147
+ cur = ggml_cpy(ctxL,
1148
+ KQV_merged,
1149
+ ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
1150
+ }
1151
+
1152
+ // projection
1153
+ {
1154
+ cur = ggml_mul_mat(ctxL,
1155
+ layer.attn_ln_1_w,
1156
+ cur);
1157
+
1158
+ cur = ggml_add(ctxL,
1159
+ ggml_repeat(ctxL, layer.attn_ln_1_b, cur),
1160
+ cur);
1161
+ }
1162
+
1163
+ // add the input
1164
+ cur = ggml_add(ctxL, cur, inpL);
1165
+
1166
+ struct ggml_tensor * inpFF = cur;
1167
+
1168
+ // feed-forward network
1169
+ {
1170
+ // norm
1171
+ {
1172
+ cur = ggml_norm(ctxL, inpFF);
1173
+
1174
+ // cur = mlp_ln_w*cur + mlp_ln_b
1175
+ cur = ggml_add(ctxL,
1176
+ ggml_mul(ctxL,
1177
+ ggml_repeat(ctxL, layer.mlp_ln_w, cur),
1178
+ cur),
1179
+ ggml_repeat(ctxL, layer.mlp_ln_b, cur));
1180
+ }
1181
+
1182
+ // fully connected
1183
+ cur = ggml_mul_mat(ctxL,
1184
+ layer.mlp_0_w,
1185
+ cur);
1186
+
1187
+ cur = ggml_add(ctxL,
1188
+ ggml_repeat(ctxL, layer.mlp_0_b, cur),
1189
+ cur);
1190
+
1191
+ // GELU activation
1192
+ cur = ggml_gelu(ctxL, cur);
1193
+
1194
+ // projection
1195
+ cur = ggml_mul_mat(ctxL,
1196
+ layer.mlp_1_w,
1197
+ cur);
1198
+
1199
+ cur = ggml_add(ctxL,
1200
+ ggml_repeat(ctxL, layer.mlp_1_b, cur),
1201
+ cur);
1202
+ }
1203
+
1204
+ // output from this layer
1205
+ struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);
1206
+
1207
+ {
1208
+ struct ggml_cgraph gf = { .n_threads = n_threads };
1209
+
1210
+ ggml_build_forward_expand(&gf, inpO);
1211
+ ggml_graph_compute (ctxL, &gf);
1212
+
1213
+ //ggml_graph_print(&gf);
1214
+ }
1215
+
1216
+ // TODO: this is a hack to have per-layer computation graphs - need to come up with something better
1217
+ // input for next layer (inpO -> inpL)
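+ // copying the data and clearing op/src0/src1 turns inpL back into a leaf tensor, so the per-layer
+ // scratch context can be freed without invalidating the input of the next layer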
1218
+ memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
1219
+ inpL->op = GGML_OP_NONE;
1220
+ inpL->src0 = NULL;
1221
+ inpL->src1 = NULL;
1222
+
1223
+ //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
1224
+
1225
+ ggml_free(ctxL);
1226
+ }
1227
+
1228
+ cur = inpL;
1229
+
1230
+ // norm
1231
+ {
1232
+ cur = ggml_norm(ctx0, cur);
1233
+
1234
+ // cur = ln_f_g*cur + ln_f_b
1235
+ cur = ggml_add(ctx0,
1236
+ ggml_mul(ctx0,
1237
+ ggml_repeat(ctx0, model.e_ln_w, cur),
1238
+ cur),
1239
+ ggml_repeat(ctx0, model.e_ln_b, cur));
1240
+ }
1241
+
1242
+ // run the computation
1243
+ {
1244
+ struct ggml_cgraph gf = { .n_threads = n_threads };
1245
+
1246
+ ggml_build_forward_expand(&gf, cur);
1247
+ ggml_graph_compute (ctx0, &gf);
1248
+
1249
+ //ggml_graph_print(&gf);
1250
+ }
1251
+
1252
+ // cur
1253
+ //{
1254
+ // printf("ne0 = %d\n", cur->ne[0]);
1255
+ // printf("ne1 = %d\n", cur->ne[1]);
1256
+ // for (int i = 0; i < 10; ++i) {
1257
+ // printf("%8.4f ", ((float *)(cur->data))[i]);
1258
+ // }
1259
+ // printf("... ");
1260
+ // for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
1261
+ // printf("%8.4f ", ((float *)(cur->data))[i]);
1262
+ // }
1263
+ // printf("\n");
1264
+ //}
1265
+
1266
+ // pre-compute cross-attention memory
1267
+ {
1268
+ struct ggml_cgraph gf = { .n_threads = n_threads };
1269
+
1270
+ // TODO: hack to disconnect the encoded features from the previous graph
1271
+ cur->op = GGML_OP_NONE;
1272
+ cur->src0 = NULL;
1273
+ cur->src1 = NULL;
1274
+
1275
+ for (int il = 0; il < model.hparams.n_text_layer; ++il) {
1276
+ auto & layer = model.layers_decoder[il];
1277
+
1278
+ struct ggml_tensor * Kcross = ggml_mul_mat(ctx0,
1279
+ layer.cross_attn_k_w,
1280
+ cur);
1281
+
1282
+ Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
1283
+
1284
+ struct ggml_tensor * Vcross = ggml_mul_mat(ctx0,
1285
+ layer.cross_attn_v_w,
1286
+ cur);
1287
+
1288
+ Vcross = ggml_add(ctx0,
1289
+ ggml_repeat(ctx0,
1290
+ layer.cross_attn_v_b,
1291
+ Vcross),
1292
+ Vcross);
1293
+
1294
+ struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx));
1295
+ struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx));
1296
+
1297
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
1298
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
1299
+ }
1300
+
1301
+ ggml_graph_compute(ctx0, &gf);
1302
+ }
1303
+
1304
+ ////////////////////////////////////////////////////////////////////////////
1305
+
1306
+ // output the features
1307
+ assert(cur->type == GGML_TYPE_F32);
1308
+ features.resize(cur->ne[0]*cur->ne[1]);
1309
+ memcpy(features.data(), cur->data, features.size()*sizeof(float));
1310
+
1311
+ //printf("%s: used_mem = %f MB\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0);
1312
+
1313
+ ggml_free(ctx0);
1314
+
1315
+ return true;
1316
+ }
1317
+
1318
+ // evaluate the decoder
1319
+ //
1320
+ // given a text prompt + audio features -> predicts the probabilities for the next token
1321
+ //
1322
+ // - model: the model
1323
+ // - n_threads: number of threads to use
1324
+ // - n_past: prompt length
1325
+ // - prompt: text prompt
1326
+ // - logits_out: output logits
1327
+ // - probs_out: output probabilities
1328
+ //
1329
+ bool whisper_decode(
1330
+ const whisper_model & model,
1331
+ const int n_threads,
1332
+ const int n_past,
1333
+ const std::vector<whisper_vocab::id> & prompt,
1334
+ std::vector<float> & logits_out,
1335
+ std::vector<float> & probs_out) {
1336
+ const auto & hparams = model.hparams;
1337
+
1338
+ const int n_vocab = hparams.n_vocab;
1339
+
1340
+ const int n_ctx = hparams.n_text_ctx;
1341
+ const int n_state = hparams.n_text_state;
1342
+ const int n_head = hparams.n_text_head;
1343
+ const int n_layer = hparams.n_text_layer;
1344
+
1345
+ const int N = prompt.size();
1346
+ const int M = hparams.n_audio_ctx;
1347
+
1348
+ struct ggml_init_params params;
1349
+
1350
+ {
1351
+ static size_t buf_size = MEM_REQ_DECODE.at(model.type);
1352
+ static void * buf = malloc(buf_size);
1353
+
1354
+ params = {
1355
+ .mem_size = buf_size,
1356
+ .mem_buffer = buf,
1357
+ };
1358
+ }
1359
+
1360
+ struct ggml_context * ctx0 = ggml_init(params);
1361
+
1362
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1363
+ memcpy(embd->data, prompt.data(), N*ggml_element_size(embd));
1364
+
1365
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1366
+ for (int i = 0; i < N; ++i) {
1367
+ ((int32_t *) position->data)[i] = n_past + i;
1368
+ }
1369
+
1370
+ // wte + wpe
1371
+ struct ggml_tensor * cur =
1372
+ ggml_add(ctx0,
1373
+ ggml_get_rows(ctx0, model.d_te, embd),
1374
+ ggml_get_rows(ctx0, model.d_pe, position));
1375
+
1376
+ struct ggml_tensor * inpL = cur;
1377
+
1378
+ for (int il = 0; il < n_layer; ++il) {
1379
+ const auto & layer = model.layers_decoder[il];
1380
+
1381
+ struct ggml_init_params paramsL;
1382
+
1383
+ {
1384
+ static size_t buf_size = MEM_REQ_DECODE_LAYER.at(model.type);
1385
+ static void * buf = malloc(buf_size);
1386
+
1387
+ paramsL = {
1388
+ .mem_size = buf_size,
1389
+ .mem_buffer = buf,
1390
+ };
1391
+ }
1392
+
1393
+ struct ggml_context * ctxL = ggml_init(paramsL);
1394
+ struct ggml_cgraph gf = { .n_threads = n_threads };
1395
+
1396
+ // norm
1397
+ {
1398
+ cur = ggml_norm(ctxL, inpL);
1399
+
1400
+ // cur = ln_0_w*cur + ln_0_b
1401
+ cur = ggml_add(ctxL,
1402
+ ggml_mul(ctxL,
1403
+ ggml_repeat(ctxL, layer.attn_ln_0_w, cur),
1404
+ cur),
1405
+ ggml_repeat(ctxL, layer.attn_ln_0_b, cur));
1406
+ }
1407
+
1408
+ // self-attention
1409
+ {
1410
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
1411
+ layer.attn_q_w,
1412
+ cur);
1413
+
1414
+ Qcur = ggml_add(ctxL,
1415
+ ggml_repeat(ctxL,
1416
+ layer.attn_q_b,
1417
+ Qcur),
1418
+ Qcur);
1419
+
1420
+ Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1421
+
1422
+ // no bias for Key
1423
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctxL,
1424
+ layer.attn_k_w,
1425
+ cur);
1426
+
1427
+ Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1428
+
1429
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctxL,
1430
+ layer.attn_v_w,
1431
+ cur);
1432
+
1433
+ Vcur = ggml_add(ctxL,
1434
+ ggml_repeat(ctxL,
1435
+ layer.attn_v_b,
1436
+ Vcur),
1437
+ Vcur);
1438
+
1439
+ // store key and value to memory
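+ // each decoder layer owns a contiguous slab of n_ctx*n_state elements in memory_k/v; the views
+ // below start at text position n_past inside layer il's slab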
1440
+ {
1441
+ struct ggml_tensor * k = ggml_view_1d(ctxL, model.memory_k, N*n_state, (ggml_element_size(model.memory_k)*n_state)*(il*n_ctx + n_past));
1442
+ struct ggml_tensor * v = ggml_view_1d(ctxL, model.memory_v, N*n_state, (ggml_element_size(model.memory_v)*n_state)*(il*n_ctx + n_past));
1443
+
1444
+ ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Kcur, k));
1445
+ ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Vcur, v));
1446
+ }
1447
+
1448
+ // ------
1449
+
1450
+ struct ggml_tensor * Q =
1451
+ ggml_permute(ctxL,
1452
+ ggml_cpy(ctxL,
1453
+ Qcur,
1454
+ ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
1455
+ 0, 2, 1, 3);
1456
+
1457
+ struct ggml_tensor * K =
1458
+ ggml_permute(ctxL,
1459
+ ggml_reshape_3d(ctxL,
1460
+ ggml_view_1d(ctxL, model.memory_k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(model.memory_k)*n_state),
1461
+ n_state/n_head, n_head, n_past + N),
1462
+ 0, 2, 1, 3);
1463
+
1464
+ // K * Q
1465
+ struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
1466
+
1467
+ //struct ggml_tensor * KQ_scaled =
1468
+ // ggml_scale(ctxL,
1469
+ // KQ,
1470
+ // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1471
+ // );
1472
+
1473
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ, n_past);
1474
+
1475
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ_masked);
1476
+
1477
+ struct ggml_tensor * V_trans =
1478
+ ggml_permute(ctxL,
1479
+ ggml_reshape_3d(ctxL,
1480
+ ggml_view_1d(ctxL, model.memory_v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(model.memory_v)*n_state),
1481
+ n_state/n_head, n_head, n_past + N),
1482
+ 1, 2, 0, 3);
1483
+
1484
+ struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1485
+
1486
+ struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
1487
+
1488
+ cur = ggml_cpy(ctxL,
1489
+ KQV_merged,
1490
+ ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
1491
+ }
1492
+
1493
+ {
1494
+ cur = ggml_mul_mat(ctxL,
1495
+ layer.attn_ln_1_w,
1496
+ cur);
1497
+
1498
+ cur = ggml_add(ctxL,
1499
+ ggml_repeat(ctxL, layer.attn_ln_1_b, cur),
1500
+ cur);
1501
+ }
1502
+
1503
+ // add the input
1504
+ struct ggml_tensor * inpCA = ggml_add(ctxL, cur, inpL);
1505
+
1506
+ // norm
1507
+ {
1508
+ cur = ggml_norm(ctxL, inpCA); // Note we use inpCA here
1509
+
1510
+ // cur = ln_0_w*cur + ln_0_b
1511
+ cur = ggml_add(ctxL,
1512
+ ggml_mul(ctxL,
1513
+ ggml_repeat(ctxL, layer.cross_attn_ln_0_w, cur),
1514
+ cur),
1515
+ ggml_repeat(ctxL, layer.cross_attn_ln_0_b, cur));
1516
+ }
1517
+
1518
+ // cross-attention
1519
+ {
1520
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
1521
+ layer.cross_attn_q_w,
1522
+ cur);
1523
+
1524
+ Qcur = ggml_add(ctxL,
1525
+ ggml_repeat(ctxL,
1526
+ layer.cross_attn_q_b,
1527
+ Qcur),
1528
+ Qcur);
1529
+
1530
+ Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1531
+
1532
+ // Kcross is already scaled
1533
+ struct ggml_tensor * Kcross =
1534
+ ggml_reshape_3d(ctxL,
1535
+ ggml_view_1d(ctxL, model.memory_cross_k, M*n_state, il*M*ggml_element_size(model.memory_cross_k)*n_state),
1536
+ n_state/n_head, n_head, M);
1537
+
1538
+ struct ggml_tensor * Vcross =
1539
+ ggml_reshape_3d(ctxL,
1540
+ ggml_view_1d(ctxL, model.memory_cross_v, M*n_state, il*M*ggml_element_size(model.memory_cross_v)*n_state),
1541
+ n_state/n_head, n_head, M);
1542
+
1543
+ // ------
1544
+
1545
+ struct ggml_tensor * Q =
1546
+ ggml_permute(ctxL,
1547
+ ggml_cpy(ctxL,
1548
+ Qcur,
1549
+ ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
1550
+ 0, 2, 1, 3);
1551
+
1552
+ struct ggml_tensor * K = ggml_permute(ctxL, Kcross, 0, 2, 1, 3);
1553
+
1554
+ // K * Q
1555
+ struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
1556
+
1557
+ //struct ggml_tensor * KQ_scaled =
1558
+ // ggml_scale(ctxL,
1559
+ // KQ,
1560
+ // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1561
+ // );
1562
+
1563
+ // no masking for cross-attention
1564
+ //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ_scaled, n_past);
1565
+
1566
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ);
1567
+
1568
+ struct ggml_tensor * V_trans = ggml_permute(ctxL, Vcross, 1, 2, 0, 3);
1569
+
1570
+ struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1571
+
1572
+ struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
1573
+
1574
+ // cur = KQV_merged.contiguous().view(n_state, N)
1575
+ cur = ggml_cpy(ctxL,
1576
+ KQV_merged,
1577
+ ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
1578
+ }
1579
+
1580
+ // projection
1581
+ {
1582
+ cur = ggml_mul_mat(ctxL,
1583
+ layer.cross_attn_ln_1_w,
1584
+ cur);
1585
+
1586
+ cur = ggml_add(ctxL,
1587
+ ggml_repeat(ctxL, layer.cross_attn_ln_1_b, cur),
1588
+ cur);
1589
+ }
1590
+
1591
+
1592
+ // add the input
1593
+ cur = ggml_add(ctxL, cur, inpCA);
1594
+
1595
+ struct ggml_tensor * inpFF = cur;
1596
+
1597
+ // feed-forward network
1598
+ {
1599
+ // norm
1600
+ {
1601
+ cur = ggml_norm(ctxL, inpFF);
1602
+
1603
+ // cur = ln_2_g*cur + ln_2_b
1604
+ // [ 768, N]
1605
+ cur = ggml_add(ctxL,
1606
+ ggml_mul(ctxL,
1607
+ ggml_repeat(ctxL, layer.mlp_ln_w, cur),
1608
+ cur),
1609
+ ggml_repeat(ctxL, layer.mlp_ln_b, cur));
1610
+ }
1611
+
1612
+ // fully connected
1613
+ cur = ggml_mul_mat(ctxL,
1614
+ layer.mlp_0_w,
1615
+ cur);
1616
+
1617
+ cur = ggml_add(ctxL,
1618
+ ggml_repeat(ctxL, layer.mlp_0_b, cur),
1619
+ cur);
1620
+
1621
+ // GELU activation
1622
+ cur = ggml_gelu(ctxL, cur);
1623
+
1624
+ // projection
1625
+ cur = ggml_mul_mat(ctxL,
1626
+ layer.mlp_1_w,
1627
+ cur);
1628
+
1629
+ cur = ggml_add(ctxL,
1630
+ ggml_repeat(ctxL, layer.mlp_1_b, cur),
1631
+ cur);
1632
+ }
1633
+
1634
+ // output from this layer
1635
+ struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);
1636
+
1637
+ {
1638
+ ggml_build_forward_expand(&gf, inpO);
1639
+ ggml_graph_compute (ctxL, &gf);
1640
+
1641
+ //ggml_graph_print(&gf);
1642
+ }
1643
+
1644
+ // TODO: this is a hack to have per-layer computation graphs - need to come up with something better
1645
+ // input for next layer (inpO -> inpL)
1646
+ memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
1647
+ inpL->op = GGML_OP_NONE;
1648
+ inpL->src0 = NULL;
1649
+ inpL->src1 = NULL;
1650
+
1651
+ if (N > 1) {
1652
+ //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
1653
+ }
1654
+
1655
+ ggml_free(ctxL);
1656
+ }
1657
+
1658
+ cur = inpL;
1659
+
1660
+ // norm
1661
+ {
1662
+ cur = ggml_norm(ctx0, cur);
1663
+
1664
+ cur = ggml_add(ctx0,
1665
+ ggml_mul(ctx0,
1666
+ ggml_repeat(ctx0, model.d_ln_w, cur),
1667
+ cur),
1668
+ ggml_repeat(ctx0, model.d_ln_b, cur));
1669
+ }
1670
+
1671
+ struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
1672
+
1673
+ // logits -> probs
1674
+ cur = ggml_dup(ctx0, logits);
1675
+ cur = ggml_soft_max(ctx0, cur); // in-place
1676
+
1677
+ // run the computation
1678
+ {
1679
+ struct ggml_cgraph gf = { .n_threads = n_threads };
1680
+
1681
+ ggml_build_forward_expand(&gf, cur);
1682
+ ggml_graph_compute (ctx0, &gf);
1683
+ }
1684
+
1685
+ logits_out.resize(N*n_vocab);
1686
+ memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
1687
+
1688
+ probs_out.resize(N*n_vocab);
1689
+ memcpy(probs_out.data(), ggml_get_data(cur), sizeof(float)*N*n_vocab);
1690
+
1691
+ //if (N > 1) {
1692
+ // const float mem_per_token = ggml_used_mem(ctx0)/1024.0/1024.0/N;
1693
+ // printf("%s: used_mem = %f MB / %f per token\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, mem_per_token);
1694
+ // printf("%s: max mem = %f MB\n", __func__, mem_per_token*model.hparams.n_text_ctx);
1695
+ //}
1696
+
1697
+ ggml_free(ctx0);
1698
+
1699
+ return true;
1700
+ }
1701
+
1702
+ // the most basic sampling scheme - select the top token
1703
+ // TODO: beam search
1704
+ // TODO: temperature
1705
+ whisper_vocab::id whisper_sample_best(
1706
+ const whisper_vocab & vocab,
1707
+ const float * probs,
1708
+ double temp,
1709
+ int offset = 0) {
1710
+ int n_logits = vocab.id_to_token.size();
1711
+
1712
+ std::vector<std::pair<double, whisper_vocab::id>> probs_id;
1713
+ probs_id.reserve(n_logits);
1714
+
1715
+ for (int i = offset; i < n_logits; i++) {
1716
+ probs_id.push_back(std::make_pair(probs[i], i));
1717
+ }
1718
+
1719
+ const int top_k = 10;
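+ // note: temp is currently unused - after the partial sort the highest-probability candidate is
+ // returned (skipping token_solm), i.e. the selection is effectively greedy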
1720
+
1721
+ // find the top K tokens
1722
+ std::partial_sort(
1723
+ probs_id.begin(),
1724
+ probs_id.begin() + top_k, probs_id.end(),
1725
+ [](const std::pair<double, whisper_vocab::id> & a, const std::pair<double, whisper_vocab::id> & b) {
1726
+ return a.first > b.first;
1727
+ });
1728
+
1729
+ probs_id.resize(top_k);
1730
+
1731
+ //printf("\n");
1732
+ //for (int i = 0; i < (int) probs_id.size(); i++) {
1733
+ // printf("%d: '%s' %f, %d\n", i, vocab.id_to_token.at(probs_id[i].second).c_str(), probs_id[i].first, probs_id[i].second);
1734
+ //}
1735
+
1736
+ int res = 0;
1737
+ while (probs_id[res].second == vocab.token_solm && res < (int) probs_id.size() - 1) {
1738
+ res++;
1739
+ }
1740
+
1741
+ return probs_id[res].second;
1742
+ }
1743
+
1744
+ // Cooley-Tukey FFT
1745
+ // poor man's implementation - use something better
1746
+ // input is real-valued
1747
+ // output is complex-valued
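+ // out[2*k + 0] / out[2*k + 1] hold the real / imaginary part of bin k; the recursion splits the
+ // input into even and odd samples and recombines them with the twiddle factors exp(-2*pi*i*k/N)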
1748
+ void fft(const std::vector<float> & in, std::vector<float> & out) {
1749
+ out.resize(in.size()*2);
1750
+
1751
+ int N = in.size();
1752
+
1753
+ if (N == 1) {
1754
+ out[0] = in[0];
1755
+ out[1] = 0;
1756
+ return;
1757
+ }
1758
+
1759
+ std::vector<float> even;
1760
+ std::vector<float> odd;
1761
+
1762
+ for (int i = 0; i < N; i++) {
1763
+ if (i % 2 == 0) {
1764
+ even.push_back(in[i]);
1765
+ } else {
1766
+ odd.push_back(in[i]);
1767
+ }
1768
+ }
1769
+
1770
+ std::vector<float> even_fft;
1771
+ std::vector<float> odd_fft;
1772
+
1773
+ fft(even, even_fft);
1774
+ fft(odd, odd_fft);
1775
+
1776
+ for (int k = 0; k < N/2; k++) {
1777
+ float theta = 2*M_PI*k/N;
1778
+
1779
+ float re = cos(theta);
1780
+ float im = -sin(theta);
1781
+
1782
+ float re_odd = odd_fft[2*k + 0];
1783
+ float im_odd = odd_fft[2*k + 1];
1784
+
1785
+ out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
1786
+ out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
1787
+
1788
+ out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
1789
+ out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
1790
+ }
1791
+ }
1792
+
1793
+ // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
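+ // pipeline: windowing -> FFT -> power spectrum -> mel filterbank -> log10 -> clamp + normalize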
1794
+ bool log_mel_spectrogram(
1795
+ const std::vector<float> sf32,
1796
+ const int sample_rate,
1797
+ const int fft_size,
1798
+ const int fft_step,
1799
+ const int n_mel,
1800
+ const int n_threads,
1801
+ const whisper_filters & filters,
1802
+ whisper_mel & mel) {
1803
+ const int n_sample = sf32.size();
1804
+ const float * samples = sf32.data();
1805
+
1806
+ // Hanning window
1807
+ std::vector<float> hann;
1808
+ hann.resize(fft_size);
1809
+ for (int i = 0; i < fft_size; i++) {
1810
+ hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
1811
+ }
1812
+
1813
+ mel.n_mel = n_mel;
1814
+ mel.n_len = (n_sample)/fft_step;
1815
+ mel.data.resize(mel.n_mel*mel.n_len);
1816
+
1817
+ const int n_fft = 1 + fft_size/2;
1818
+
1819
+ printf("%s: n_sample = %d, n_len = %d\n", __func__, n_sample, mel.n_len);
1820
+ printf("%s: recording length: %f s\n", __func__, (float) n_sample/sample_rate);
1821
+
1822
+ std::vector<std::thread> workers(n_threads);
1823
+ for (int iw = 0; iw < n_threads; ++iw) {
1824
+ workers[iw] = std::thread([&](int ith) {
1825
+ std::vector<float> fft_in;
1826
+ fft_in.resize(fft_size);
1827
+ for (int i = 0; i < fft_size; i++) {
1828
+ fft_in[i] = 0.0;
1829
+ }
1830
+
1831
+ std::vector<float> fft_out;
1832
+ fft_out.resize(2*fft_size);
1833
+
1834
+ for (int i = ith; i < mel.n_len; i += n_threads) {
1835
+ const int offset = i*fft_step;
1836
+
1837
+ // apply Hanning window
1838
+ for (int j = 0; j < fft_size; j++) {
1839
+ if (offset + j < n_sample) {
1840
+ fft_in[j] = hann[j]*samples[offset + j];
1841
+ } else {
1842
+ fft_in[j] = 0.0;
1843
+ }
1844
+ }
1845
+
1846
+ // FFT -> mag^2
1847
+ fft(fft_in, fft_out);
1848
+
1849
+ for (int j = 0; j < n_fft; j++) {
1850
+ fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
1851
+ }
1852
+
1853
+ // mel spectrogram
1854
+ for (int j = 0; j < mel.n_mel; j++) {
1855
+ double sum = 0.0;
1856
+
1857
+ for (int k = 0; k < n_fft; k++) {
1858
+ sum += fft_out[k]*filters.data[j*n_fft + k];
1859
+ }
1860
+ if (sum < 1e-10) {
1861
+ sum = 1e-10;
1862
+ }
1863
+
1864
+ sum = log10(sum);
1865
+
1866
+ mel.data[j*mel.n_len + i] = sum;
1867
+ }
1868
+ }
1869
+ }, iw);
1870
+ }
1871
+
1872
+ for (int iw = 0; iw < n_threads; ++iw) {
1873
+ workers[iw].join();
1874
+ }
1875
+
1876
+ // clamping and normalization
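+ // following the reference implementation: clamp every value to within 8.0 of the spectrogram
+ // maximum (in log10 units), then rescale with (x + 4.0)/4.0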
1877
+ double mmax = -1e20;
1878
+ for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
1879
+ if (mel.data[i] > mmax) {
1880
+ mmax = mel.data[i];
1881
+ }
1882
+ }
1883
+
1884
+ mmax -= 8.0;
1885
+
1886
+ for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
1887
+ if (mel.data[i] < mmax) {
1888
+ mel.data[i] = mmax;
1889
+ }
1890
+
1891
+ mel.data[i] = (mel.data[i] + 4.0)/4.0;
1892
+ }
1893
+
1894
+ return true;
1895
+ }
1896
+
1897
+ int main(int argc, char ** argv) {
1898
+ const int64_t t_main_start_us = ggml_time_us();
1899
+
1900
+ whisper_params params;
1901
+ params.model = "models/whisper-tiny.en/ggml-model.bin";
1902
+
1903
+ if (whisper_params_parse(argc, argv, params) == false) {
1904
+ return 1;
1905
+ }
1906
+
1907
+ if (params.seed < 0) {
1908
+ params.seed = time(NULL);
1909
+ }
1910
+
1911
+ // Model loading
1912
+
1913
+ //printf("%s: seed = %d\n", __func__, params.seed);
1914
+
1915
+ int64_t t_load_us = 0;
1916
+ int64_t t_mel_us = 0;
1917
+ int64_t t_sample_us = 0;
1918
+ int64_t t_encode_us = 0;
1919
+ int64_t t_decode_us = 0;
1920
+
1921
+ whisper_vocab vocab;
1922
+ whisper_model model;
1923
+
1924
+ // load the model
1925
+ {
1926
+ const int64_t t_start_us = ggml_time_us();
1927
+
1928
+ if (!whisper_model_load(params.model, model, vocab)) {
1929
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
1930
+ return 1;
1931
+ }
1932
+
1933
+ t_load_us = ggml_time_us() - t_start_us;
1934
+ }
1935
+
1936
+ // WAV input
1937
+ std::vector<float> pcmf32;
1938
+ {
1939
+ drwav wav;
1940
+ if (!drwav_init_file(&wav, params.fname_inp.c_str(), NULL)) {
1941
+ fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], params.fname_inp.c_str());
1942
+ return 2;
1943
+ }
1944
+
1945
+ if (wav.channels != 1) {
1946
+ fprintf(stderr, "%s: WAV file '%s' must be mono\n", argv[0], params.fname_inp.c_str());
1947
+ return 3;
1948
+ }
1949
+
1950
+ if (wav.sampleRate != SAMPLE_RATE) {
1951
+ fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], params.fname_inp.c_str());
1952
+ return 4;
1953
+ }
1954
+
1955
+ if (wav.bitsPerSample != 16) {
1956
+ fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], params.fname_inp.c_str());
1957
+ return 5;
1958
+ }
1959
+
1960
+ std::vector<int16_t> pcm16;
1961
+ pcm16.resize(wav.totalPCMFrameCount);
1962
+ drwav_read_pcm_frames_s16(&wav, wav.totalPCMFrameCount, pcm16.data());
1963
+ drwav_uninit(&wav);
1964
+
1965
+ // convert to float
1966
+ pcmf32.resize(pcm16.size());
1967
+ for (size_t i = 0; i < pcm16.size(); i++) {
1968
+ pcmf32[i] = float(pcm16[i])/32768.0f;
1969
+ }
1970
+ }
1971
+
1972
+ // compute log mel spectrogram
1973
+ whisper_mel mel_inp;
1974
+ {
1975
+ const int64_t t_start_us = ggml_time_us();
1976
+
1977
+ log_mel_spectrogram(pcmf32, SAMPLE_RATE, N_FFT, HOP_LENGTH, N_MEL, params.n_threads, model.filters, mel_inp);
1978
+
1979
+ t_mel_us = ggml_time_us() - t_start_us;
1980
+ }
1981
+
1982
+ std::vector<whisper_vocab::id> prompt_past = { };
1983
+
1984
+ // main loop
1985
+ int seek = 0;
1986
+ while (true) {
1987
+ if (seek >= mel_inp.n_len) {
1988
+ break;
1989
+ }
1990
+
1991
+ // encode audio features starting at offset seek
1992
+ std::vector<float> features;
1993
+ {
1994
+ const int64_t t_start_us = ggml_time_us();
1995
+
1996
+ if (!whisper_encode(model, params.n_threads, seek, mel_inp, features)) {
1997
+ fprintf(stderr, "%s: failed to eval\n", __func__);
1998
+ return 1;
1999
+ }
2000
+
2001
+ t_encode_us = ggml_time_us() - t_start_us;
2002
+ }
2003
+
2004
+ std::vector<float> probs;
2005
+ std::vector<float> logits;
2006
+
2007
+ // SOT
2008
+ // ref: https://github.com/openai/whisper/blob/15ab54826343c27cfaf44ce31e9c8fb63d0aa775/whisper/decoding.py#L506-L526
2009
+ // TODO: use different initial tokens for different tasks
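+ // when there is previous text, the prompt below becomes [token_prev, up to n_text_ctx/2 most
+ // recent tokens, token_sot], which conditions the decoder on the earlier output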
2010
+ std::vector<whisper_vocab::id> prompt = { vocab.token_sot };
2011
+
2012
+ int n_past = 0;
2013
+
2014
+ if (prompt_past.size() > 0) {
2015
+ int n_take = std::min(model.hparams.n_text_ctx/2, int(prompt_past.size()));
2016
+
2017
+ prompt = { vocab.token_prev };
2018
+ prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
2019
+ prompt.push_back(vocab.token_sot);
2020
+
2021
+ prompt_past.clear();
2022
+ prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - 1);
2023
+ }
2024
+
2025
+ bool done = false;
2026
+ int seek_delta = 100*CHUNK_SIZE;
2027
+ whisper_vocab::id last_id = 0;
2028
+
2029
+ //for (int i = 0; i < prompt.size(); i++) {
2030
+ // printf("%s: prompt[%d] = %s\n", __func__, i, vocab.id_to_token[prompt[i]].c_str());
2031
+ //}
2032
+
2033
+ printf("\n");
2034
+ for (int i = 0; i < model.hparams.n_text_ctx/2; ++i) {
2035
+ // decode
2036
+ if (prompt.size() > 0) {
2037
+ const int64_t t_start_us = ggml_time_us();
2038
+
2039
+ if (!whisper_decode(model, params.n_threads, n_past, prompt, logits, probs)) {
2040
+ fprintf(stderr, "%s: failed to eval\n", __func__);
2041
+ return 1;
2042
+ }
2043
+
2044
+ t_decode_us += ggml_time_us() - t_start_us;
2045
+ }
2046
+
2047
+ n_past += prompt.size();
2048
+ prompt.clear();
2049
+
2050
+ {
2051
+ // sample next token
2052
+ const float temp = 1.0; // TODO
2053
+
2054
+ const int n_vocab = model.hparams.n_vocab;
2055
+
2056
+ whisper_vocab::id id = 0;
2057
+
2058
+ {
2059
+ const int64_t t_start_sample_us = ggml_time_us();
2060
+
2061
+ id = whisper_sample_best(vocab, probs.data() + (probs.size() - n_vocab), temp, i > params.max_tokens_per_iter ? vocab.token_beg : 0);
2062
+
2063
+ t_sample_us += ggml_time_us() - t_start_sample_us;
2064
+ }
2065
+
2066
+ // end of text token
2067
+ if (id == vocab.token_eot) {
2068
+ break;
2069
+ }
2070
+
2071
+ // 2 consecutive time tokens
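+ // timestamp tokens encode multiples of 20 ms while mel frames are 10 ms apart, so the second
+ // timestamp yields a segment length of 2*(id - token_beg) frames to seek ahead by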
2072
+ if (id > vocab.token_beg && last_id > vocab.token_beg) {
2073
+ seek_delta = 2*(id - vocab.token_beg);
2074
+ done = true;
2075
+ }
2076
+ last_id = id;
2077
+
2078
+ // add it to the context
2079
+ prompt.push_back(id);
2080
+ prompt_past.push_back(id);
2081
+ }
2082
+
2083
+ // display text
2084
+ for (auto id : prompt) {
2085
+ if (params.print_special_tokens == false && id >= vocab.token_eot) {
2086
+ continue;
2087
+ }
2088
+ printf("%s", vocab.id_to_token[id].c_str());
2089
+ }
2090
+ fflush(stdout);
2091
+
2092
+ if (done) {
2093
+ break;
2094
+ }
2095
+ }
2096
+
2097
+ seek += seek_delta;
2098
+ }
2099
+
2100
+ // report timing
2101
+ {
2102
+ const int64_t t_main_end_us = ggml_time_us();
2103
+
2104
+ printf("\n\n");
2105
+ printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
2106
+ printf("%s: mel time = %8.2f ms\n", __func__, t_mel_us/1000.0f);
2107
+ printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
2108
+ printf("%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, t_encode_us/1000.0f, t_encode_us/1000.0f/model.hparams.n_audio_layer);
2109
+ printf("%s: decode time = %8.2f ms\n", __func__, t_decode_us/1000.0f);
2110
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
2111
+ }
2112
+
2113
+ ggml_free(model.ctx);
2114
+
2115
+ return 0;
2116
+ }
models/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.bin
samples/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
samples/jfk.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59dfb9a4acb36fe2a2affc14bacbee2920ff435cb13cc314a08c13f66ba7860e
3
+ size 352078