# TODO: automate the update of convert-hf-to-gguf.py
#

import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer

logger = logging.getLogger("convert-hf-to-gguf-update")
# without a configured handler the INFO messages below would be dropped;
# a plain basicConfig (assumed to be the intended setup) is enough here
logging.basicConfig(level=logging.INFO)


class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()


# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

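# how the fingerprint built from chktxt works, in one line (illustrative; `tok`
# stands for any loaded tokenizer): identical pre-tokenization gives identical
# token ids, and therefore an identical SHA-256 digest:
#
#   chkhsh = sha256(str(tok.encode(chktxt)).encode()).hexdigest()
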
if len(sys.argv) == 2:
    token = sys.argv[1]
else:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

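# example invocation (the token shown is a placeholder, not a real credential):
#
#   python convert-hf-to-gguf-update.py hf_xxxxxxxxxxxxxxxxxxxxxxxx
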
# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
]

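# to add support for a new model, append an entry of the same shape, e.g.
# (hypothetical repo shown):
#
#   {"name": "my-model", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/my-org/my-model", },
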
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        logger.info(f"File {save_path} downloaded successfully")
    else:
        logger.error(f"Failed to download file. Status code: {response.status_code}")


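# alternative sketch (not called anywhere in this script): a more defensive
# download that streams to disk and raises on HTTP errors; the name
# download_file_with_auth_streaming is ours, not part of the original script
def download_file_with_auth_streaming(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    # stream=True avoids holding large tokenizer blobs fully in memory
    with requests.get(url, headers=headers, stream=True, timeout=60) as response:
        response.raise_for_status()  # surface 4xx/5xx as exceptions instead of log lines
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

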
# download the tokenizer models
for model in models:
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
    download_file_with_auth(url, token, save_path)

    # ... (elided in this excerpt: downloads of the remaining tokenizer files,
    # including the tokenizer.json that is read back below) ...

# generate the get_vocab_base_pre() detection code for convert-hf-to-gguf.py
# (loop header reconstructed from context: name/tokt are used below, and SPM
# tokenizers are skipped since they need no BPE pre-tokenizer fingerprint)
src_ifs = ""
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    if tokt == TOKENIZER_TYPE.SPM:
        continue

    # create the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
    logger.info(f"tokt: {tokt}")
    logger.info(f"repo: {model['repo']}")
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

    logger.info("")

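    # for reference, a ByteLevel BPE entry typically dumps roughly like this
    # (illustrative only - exact fields vary per model):
    #
    #   {
    #       "type": "ByteLevel",
    #       "add_prefix_space": false,
    #       "trim_offsets": true,
    #       "use_regex": true
    #   }
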
    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

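    # each model contributes one branch to the generated function, e.g.
    # (placeholder hash, not a real fingerprint):
    #
    #     if chkhsh == "0123456789abcdef...":
    #         # ref: https://huggingface.co/openai-community/gpt2
    #         res = "gpt-2"
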
src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

        chktxt = {repr(chktxt)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        print(f"chktok: {{chktok}}")
        print(f"chkhsh: {{chkhsh}}")

        res = None

        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
        # or pull the latest version of the model from Huggingface
        # don't edit the hashes manually!
{src_ifs}
        if res is None:
            print("\\n")
            print("**************************************************************************************")
            print("** WARNING: The BPE pre-tokenizer was not recognized!")
            print("** There are 2 possible reasons for this:")
            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
            print("** - the pre-tokenization config has changed upstream")
            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
            print("**")
            print(f"** chkhsh: {{chkhsh}}")
            print("**************************************************************************************")
            print("\\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        print(f"tokenizer.ggml.pre: {{repr(res)}}")
        print(f"chkhsh: {{chkhsh}}")

        return res
"""

print(src_func)  # noqa: NP100

logger.info("\n")
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n")

# generate tests for each tokenizer model

# ... (elided in this excerpt: the list of pre-tokenizer test strings, `tests = [...]`) ...

for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # create the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        # ... (elided in this excerpt: writing each test string to the .inp file,
        # opening the matching .out file, and encoding each test into ids `res`) ...
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

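# for illustration: a test string that encodes to token ids [15043, 2787]
# (made-up ids) ends up in the .out file as the line " 15043 2787",
# one leading space per id, as written by the loop above
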
# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

logger.info("\n")