Spaces:
Running
Running
"""Convert a whisper.cpp ggml model file into a PyTorch checkpoint.

Usage: convert-ggml-to-pt.py model.bin dir-output

Reads the ggml binary layout (header, mel filterbank, tokenizer vocab,
tensor records), rebuilds a Whisper model from the recovered state dict,
and saves it as ``dir-output/torch-model.pt``.
"""
import struct
import sys
from collections import OrderedDict
from pathlib import Path

import numpy as np
import torch

# Field names for the 12-int32 ggml header, in file order.
_HEADER_FIELDS = (
    "magic_number", "n_vocab", "n_audio_ctx", "n_audio_state",
    "n_audio_head", "n_audio_layer", "n_text_ctx", "n_text_state",
    "n_text_head", "n_text_layer", "n_mels", "use_f16",
)


def read_header(f):
    """Read the ggml hyperparameter header (12 int32s) into a dict.

    :param f: binary file-like positioned at the start of the file
    :return: dict mapping each name in ``_HEADER_FIELDS`` to its int value
    """
    return dict(zip(_HEADER_FIELDS, struct.unpack("12i", f.read(48))))


def read_mel_filters(f):
    """Read the mel filterbank: two int32 dims, then rows*cols float32s.

    :param f: binary file-like positioned at the filterbank section
    :return: float64 ndarray of shape (rows, cols)
    """
    rows = struct.unpack("i", f.read(4))[0]
    cols = struct.unpack("i", f.read(4))[0]
    # Bulk-read all values at once instead of one struct.unpack per
    # element (the original per-element loop was needlessly slow).
    data = np.frombuffer(f.read(4 * rows * cols), dtype=np.float32)
    return data.astype(np.float64).reshape(rows, cols)


def read_tokens(f):
    """Read the tokenizer vocab: int32 count, then (int32 length, bytes) pairs.

    :param f: binary file-like positioned at the vocab section
    :return: dict mapping each raw token (bytes) to an empty dict
    """
    num_tokens = struct.unpack("i", f.read(4))[0]
    tokens = {}
    for _ in range(num_tokens):
        token_len = struct.unpack("i", f.read(4))[0]
        tokens[f.read(token_len)] = {}
    return tokens


def read_tensors(f):
    """Read tensor records until EOF into an OrderedDict name -> torch.Tensor.

    Each record is: int32 n_dims, int32 name_length, int32 ftype
    (1 = float16, otherwise float32), n_dims int32 dims (stored
    innermost-first, so reversed here), the utf-8 name, then raw data.

    :param f: binary file-like positioned at the first tensor record
    :return: OrderedDict suitable for ``model.load_state_dict``
    """
    state_dict = OrderedDict()
    while True:
        try:
            n_dims, name_length, ftype = struct.unpack("iii", f.read(12))
        except struct.error:
            break  # end of file
        dims = [struct.unpack("i", f.read(4))[0] for _ in range(n_dims)]
        dims = dims[::-1]
        name = f.read(name_length).decode("utf-8")
        dtype = np.dtype(np.float16 if ftype == 1 else np.float32)
        count = int(np.prod(dims))
        # frombuffer (not np.fromfile) so any file-like object works;
        # .copy() because frombuffer yields a read-only view, which
        # torch.from_numpy rejects.
        data = (
            np.frombuffer(f.read(count * dtype.itemsize), dtype=dtype)
            .reshape(dims)
            .copy()
        )
        # ggml stores the conv biases as (n, 1); PyTorch expects 1-D.
        if name in ("encoder.conv1.bias", "encoder.conv2.bias"):
            data = data[:, 0]
        state_dict[name] = torch.from_numpy(data)
    return state_dict


def main():
    """Parse argv, convert the ggml file, and save the PyTorch checkpoint."""
    if len(sys.argv) < 3:
        print(
            "Usage: convert-ggml-to-pt.py model.bin dir-output\n")
        sys.exit(1)
    fname_inp = Path(sys.argv[1])
    dir_out = Path(sys.argv[2])
    fname_out = dir_out / "torch-model.pt"

    with open(fname_inp, "rb") as f:
        hp = read_header(f)
        # NOTE(review): the magic number is printed but never validated
        # (ggml magic is 0x67676d6c) — consider failing fast on mismatch.
        print(f"Magic number: {hp['magic_number']}")
        print(f"Vocab size: {hp['n_vocab']}")
        print(f"Audio context size: {hp['n_audio_ctx']}")
        print(f"Audio state size: {hp['n_audio_state']}")
        print(f"Audio head size: {hp['n_audio_head']}")
        print(f"Audio layer size: {hp['n_audio_layer']}")
        print(f"Text context size: {hp['n_text_ctx']}")
        print(f"Text head size: {hp['n_text_head']}")
        print(f"Mel size: {hp['n_mels']}")

        mel_filters = read_mel_filters(f)
        print(f"Filters shape 0: {mel_filters.shape[0]}")
        print(f"Filters shape 1: {mel_filters.shape[1]}")

        tokens = read_tokens(f)  # read to advance the stream; not used below
        model_state_dict = read_tensors(f)

    # Imported lazily: third-party package, only needed to instantiate
    # the model architecture (mirrors the original mid-script import).
    from whisper import Whisper, ModelDimensions

    dims = ModelDimensions(
        n_mels=hp["n_mels"],
        n_audio_ctx=hp["n_audio_ctx"],
        n_audio_state=hp["n_audio_state"],
        n_audio_head=hp["n_audio_head"],
        n_audio_layer=hp["n_audio_layer"],
        n_text_ctx=hp["n_text_ctx"],
        n_text_state=hp["n_text_state"],
        n_text_head=hp["n_text_head"],
        n_text_layer=hp["n_text_layer"],
        n_vocab=hp["n_vocab"],
    )
    model = Whisper(dims)
    model.load_state_dict(model_state_dict)

    # Save the converted weights in PyTorch format.
    torch.save(model.state_dict(), fname_out)


if __name__ == "__main__":
    main()