diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py
index 80244d735e9..9f004d9bce5 100644
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@@ -107,6 +107,8 @@ def bytes_to_unicode():
 fname_out = dir_out / "ggml-model.bin"
 
 tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
+if "<|endoftext|>" in tokens:
+    del tokens["<|endoftext|>"]
 
 # use 16-bit or 32-bit floats
 use_f16 = True
diff --git a/models/for-tests-ggml-base.bin b/models/for-tests-ggml-base.bin
index 93cd25b9ea3..76492eb0eb4 100644
Binary files a/models/for-tests-ggml-base.bin and b/models/for-tests-ggml-base.bin differ
diff --git a/models/for-tests-ggml-base.en.bin b/models/for-tests-ggml-base.en.bin
index 8d9635c4b45..737c439a835 100644
Binary files a/models/for-tests-ggml-base.en.bin and b/models/for-tests-ggml-base.en.bin differ
diff --git a/models/for-tests-ggml-large.bin b/models/for-tests-ggml-large.bin
index 6f73b5bf401..f0eb8853d9c 100644
Binary files a/models/for-tests-ggml-large.bin and b/models/for-tests-ggml-large.bin differ
diff --git a/models/for-tests-ggml-medium.bin b/models/for-tests-ggml-medium.bin
index 6ca087929c5..60839c2d7f3 100644
Binary files a/models/for-tests-ggml-medium.bin and b/models/for-tests-ggml-medium.bin differ
diff --git a/models/for-tests-ggml-medium.en.bin b/models/for-tests-ggml-medium.en.bin
index 281ebdbb41c..8a568857139 100644
Binary files a/models/for-tests-ggml-medium.en.bin and b/models/for-tests-ggml-medium.en.bin differ
diff --git a/models/for-tests-ggml-small.bin b/models/for-tests-ggml-small.bin
index e9439e99f87..c87e07c492d 100644
Binary files a/models/for-tests-ggml-small.bin and b/models/for-tests-ggml-small.bin differ
diff --git a/models/for-tests-ggml-small.en.bin b/models/for-tests-ggml-small.en.bin
index 8b301de0a63..04f01ff1dc1 100644
Binary files a/models/for-tests-ggml-small.en.bin and b/models/for-tests-ggml-small.en.bin differ
diff --git a/models/for-tests-ggml-tiny.bin b/models/for-tests-ggml-tiny.bin
index 1351aeba9b1..6837ddf0df0 100644
Binary files a/models/for-tests-ggml-tiny.bin and b/models/for-tests-ggml-tiny.bin differ
diff --git a/models/for-tests-ggml-tiny.en.bin b/models/for-tests-ggml-tiny.en.bin
index 30868e1c8e3..e8fe2044b18 100644
Binary files a/models/for-tests-ggml-tiny.en.bin and b/models/for-tests-ggml-tiny.en.bin differ
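Context for the converter change above: the loader (see the whisper.cpp hunks below) now derives all special-token ids from the token count stored in the model file, so the converter must serialize only the plain text tokens; `<|endoftext|>` is therefore dropped from `vocab.json` before writing. A minimal sketch of the intended invariant — the path and the concrete count are illustrative assumptions based on the stock GPT-2 tokenizer:

```python
import json

# Sketch (illustrative path; stock GPT-2 vocabulary assumed): after the
# converter drops <|endoftext|>, the serialized token count equals the
# number of plain text tokens, and the loader reconstructs token_eot as
# exactly that count.
tokens = json.load(open("vocab.json", "r", encoding="utf8"))
tokens.pop("<|endoftext|>", None)  # same effect as the guarded `del` above

n_common_vocab = len(tokens)       # the count written to the ggml file
token_eot = n_common_vocab         # id the loader assigns to <|endoftext|>
print(n_common_vocab, token_eot)   # 50256 50256 for the GPT-2 tokenizer
```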
fout.write(struct.pack("i", hparams.n_audio_ctx)) + fout.write(struct.pack("i", hparams.n_audio_state)) + fout.write(struct.pack("i", hparams.n_audio_head)) + fout.write(struct.pack("i", hparams.n_audio_layer)) + fout.write(struct.pack("i", hparams.n_text_ctx)) + fout.write(struct.pack("i", hparams.n_text_state)) + fout.write(struct.pack("i", hparams.n_text_head)) + fout.write(struct.pack("i", hparams.n_text_layer)) + fout.write(struct.pack("i", hparams.n_mels)) + fout.write(struct.pack("i", hparams.ftype)) + +def write_mel_filters(fout, hparams, mel_filters_path): + print("loading real Mel filter data...") + # load the Mel filter from the npz file + with np.load(mel_filters_path) as f: + filters = f[f"mel_{hparams.n_mels}"] + fout.write(struct.pack("i", filters.shape[0])) + fout.write(struct.pack("i", filters.shape[1])) + for i in range(filters.shape[0]): + for j in range(filters.shape[1]): + fout.write(struct.pack("f", filters[i][j])) + +def write_tokenizer(fout, tokenizer_path): + # read tokenizer file + with open(tokenizer_path, "r") as f: + tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in f.readlines() if line)} + # write size of tokenizer + fout.write(struct.pack("i", len(tokens))) + # write vocabulary + for t in tokens: + fout.write(struct.pack("i", len(t))) + fout.write(t) + +def generate_empty_model(filename, hparams): + print(f"generate empty model file: {filename}") + with open(filename, "wb") as f: + write_ggml_metadata(f, hparams) + write_mel_filters(f, hparams, "whisper/whisper/assets/mel_filters.npz") + write_tokenizer(f, f"whisper/whisper/assets/{'gpt2' if hparams.n_vocab < 51865 else 'multilingual'}.tiktoken") + # ignore the rest of the model + +if __name__ == "__main__": + os.system("git clone https://github.com/openai/whisper.git") + + # Base models + generate_empty_model("for-tests-ggml-base.bin", HyperParams( + n_vocab=51865, n_audio_state=512, n_audio_head=8, n_audio_layer=6, + n_text_state=512, n_text_head=8, n_text_layer=6 + )) + generate_empty_model("for-tests-ggml-base.en.bin", HyperParams( + n_vocab=51864, n_audio_state=512, n_audio_head=8, n_audio_layer=6, + n_text_state=512, n_text_head=8, n_text_layer=6 + )) + + # Small models + generate_empty_model("for-tests-ggml-small.bin", HyperParams( + n_vocab=51865, n_audio_state=768, n_audio_head=12, n_audio_layer=12, + n_text_state=768, n_text_head=12, n_text_layer=12 + )) + generate_empty_model("for-tests-ggml-small.en.bin", HyperParams( + n_vocab=51864, n_audio_state=768, n_audio_head=12, n_audio_layer=12, + n_text_state=768, n_text_head=12, n_text_layer=12 + )) + + # Medium models + generate_empty_model("for-tests-ggml-medium.bin", HyperParams( + n_vocab=51865, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, + n_text_state=1024, n_text_head=16, n_text_layer=24 + )) + generate_empty_model("for-tests-ggml-medium.en.bin", HyperParams( + n_vocab=51864, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, + n_text_state=1024, n_text_head=16, n_text_layer=24 + )) + + # Large models + generate_empty_model("for-tests-ggml-large.bin", HyperParams( + n_vocab=51865, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, + n_text_state=1280, n_text_head=20, n_text_layer=32 + )) + # generate_empty_model("for-tests-ggml-large-v3.bin", HyperParams( # add <|yue|> + # n_vocab=51866, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, + # n_text_state=1280, n_text_head=20, n_text_layer=32 + # )) + + # Tiny models + generate_empty_model("for-tests-ggml-tiny.bin", 
diff --git a/src/whisper.cpp b/src/whisper.cpp
index b6581f2b409..ab0e93b7921 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -453,7 +453,7 @@ struct whisper_vocab {
     }
 
     int num_languages() const {
-        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
+        return token_translate - token_sot - 1;
     }
 };
 
@@ -1587,21 +1587,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
 
     // load vocab
     {
-        int32_t n_vocab = 0;
-        read_safe(loader, n_vocab);
-
-        //if (n_vocab != model.hparams.n_vocab) {
-        //    WHISPER_LOG_ERROR("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-        //    return false;
-        //}
+        int32_t n_common_vocab = 0;
+        read_safe(loader, n_common_vocab);
+        WHISPER_LOG_INFO("%s: n_common_vocab = %d\n", __func__, n_common_vocab);
 
         std::string word;
         std::vector<char> tmp;
 
         tmp.reserve(128);
 
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < n_common_vocab; i++) {
             uint32_t len;
             read_safe(loader, len);
 
@@ -1621,26 +1616,23 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
         }
 
-        vocab.n_vocab = model.hparams.n_vocab;
-        if (vocab.is_multilingual()) {
-            vocab.token_eot++;
-            vocab.token_sot++;
-
-            // account for variable number of language tokens
-            const int dt = vocab.num_languages() - 98;
+        vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens
 
-            vocab.token_translate  += dt;
-            vocab.token_transcribe += dt;
-            vocab.token_solm       += dt;
-            vocab.token_prev       += dt;
-            vocab.token_nosp       += dt;
-            vocab.token_not        += dt;
-            vocab.token_beg        += dt;
-        }
+        vocab.token_eot = n_common_vocab;     // <|endoftext|>: 50256 for en, 50257 for multilingual, other values for custom models
+        vocab.token_sot = n_common_vocab + 1; // <|startoftranscript|>
+        // [n_common_vocab + 2, vocab.n_vocab - 1507) are language tokens
+        // num_languages = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_common_vocab - 1509
+        vocab.token_translate  = vocab.n_vocab - 1507; // <|translate|>
+        vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
+        vocab.token_solm       = vocab.n_vocab - 1505; // <|startoflm|>
+        vocab.token_prev       = vocab.n_vocab - 1504; // <|startofprev|>
+        vocab.token_nosp       = vocab.n_vocab - 1503; // <|nospeech|>
+        vocab.token_not        = vocab.n_vocab - 1502; // <|notimestamps|>
+        vocab.token_beg        = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens
 
-        if (n_vocab < model.hparams.n_vocab) {
-            WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
-            for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
+        if (n_common_vocab < model.hparams.n_vocab) {
+            WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_common_vocab);
+            for (int i = n_common_vocab; i < model.hparams.n_vocab; i++) {
                 if (i > vocab.token_beg) {
                     word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
                 } else if (i == vocab.token_eot) {
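The new layout can be checked numerically. Below is a small Python mirror of the assignments in the hunk above (a sketch; the vocabulary sizes used in the asserts are the stock Whisper values, not read from any file):

```python
# Python mirror of the loader's special-token arithmetic above.
def special_token_ids(n_vocab, n_common_vocab):
    ids = {
        "eot":        n_common_vocab,      # <|endoftext|>
        "sot":        n_common_vocab + 1,  # <|startoftranscript|>
        "translate":  n_vocab - 1507,      # <|translate|>
        "transcribe": n_vocab - 1506,      # <|transcribe|>
        "solm":       n_vocab - 1505,      # <|startoflm|>
        "prev":       n_vocab - 1504,      # <|startofprev|>
        "nosp":       n_vocab - 1503,      # <|nospeech|>
        "not":        n_vocab - 1502,      # <|notimestamps|>
        "beg":        n_vocab - 1501,      # <|0.00|>
    }
    # language tokens occupy [sot + 1, translate)
    ids["num_languages"] = ids["translate"] - ids["sot"] - 1
    return ids

# Stock multilingual models: 50257 common tokens, 51865 total.
ml = special_token_ids(51865, 50257)
assert (ml["eot"], ml["sot"], ml["beg"]) == (50257, 50258, 50364)
assert ml["num_languages"] == 99

# Stock English-only models: 50256 common tokens, 51864 total.
en = special_token_ids(51864, 50256)
assert (en["eot"], en["beg"], en["num_languages"]) == (50256, 50363, 99)
```

The asserts reproduce the ids that the old hard-coded layout produced for the standard models, which is the point of the change: custom models with a different common vocabulary size now get consistent ids for free.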