diff --git a/README.md b/README.md
index f0f79195007..ea3664e76cd 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,13 @@
-# Alpaca.cpp
+# alpaca.cpp
+
+## Changes to Alpaca.cpp Repository
+
+We introduce the following changes/hacks to the repository:
+- Update the number of tokens in the vocabulary to match gpt4all
+- Remove the instruction/response prompt in the repository
+- Add chat binaries (OSX and Linux) to the repository
+
+## Get Started (7B)
 
 Run a fast ChatGPT-like model locally on your device. The screencast below is not sped up and running on an M2 Macbook Air with 4GB of weights.
 
@@ -8,7 +17,6 @@ Run a fast ChatGPT-like model locally on your device. The screencast below is no
 
 This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT) and a set of modifications to [llama.cpp](https://github.com/ggerganov/llama.cpp) to add a chat interface.
 
-## Get Started (7B)
 
 Download the zip file corresponding to your operating system from the [latest release](https://github.com/antimatter15/alpaca.cpp/releases/latest). On Windows, download `alpaca-win.zip`, on Mac (both Intel or ARM) download `alpaca-mac.zip`, and on Linux (x64) download `alpaca-linux.zip`.
 
diff --git a/chat.cpp b/chat.cpp
index 22f0a4ddf4a..4e9de5811f6 100644
--- a/chat.cpp
+++ b/chat.cpp
@@ -38,7 +38,7 @@ static const std::map<int, int> LLAMA_N_PARTS = {
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    int32_t n_vocab = 32000;
+    int32_t n_vocab = 32001;
     int32_t n_ctx   = 512;   // this is provided as user input?
     int32_t n_embd  = 4096;
     int32_t n_mult  = 256;
@@ -153,7 +153,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         }
 
         std::string word;
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < n_vocab - 1; i++) {
             uint32_t len;
             fin.read((char *) &len, sizeof(len));
@@ -167,6 +167,8 @@
             //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
+        vocab.token_to_id["<pad>"] = n_vocab - 1;
+        vocab.id_to_token[n_vocab - 1] = "<pad>";
     }
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
diff --git a/gpt4all-lora-quantized-OSX-m1 b/gpt4all-lora-quantized-OSX-m1
new file mode 100755
index 00000000000..5d9664401c6
Binary files /dev/null and b/gpt4all-lora-quantized-OSX-m1 differ
diff --git a/gpt4all-lora-quantized-linux-x86 b/gpt4all-lora-quantized-linux-x86
new file mode 100755
index 00000000000..542c1ebf0f5
Binary files /dev/null and b/gpt4all-lora-quantized-linux-x86 differ
diff --git a/utils.h b/utils.h
index 2a843371a35..b81b9868a63 100644
--- a/utils.h
+++ b/utils.h
@@ -13,53 +13,55 @@
 //
 // The default parameters
 
-struct gpt_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t repeat_last_n = 64; // last n tokens to penalize
-    int32_t n_ctx = 2048; //context size
-
+struct gpt_params
+{
+    int32_t seed = -1;          // RNG seed
+    int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
+    int32_t n_predict = 128;    // new tokens to predict
+    int32_t repeat_last_n = 64; // last n tokens to penalize
+    int32_t n_ctx = 2048;       // context size
+
     // sampling parameters
     int32_t top_k = 40;
-    float   top_p = 0.95f;
-    float   temp  = 0.10f;
-    float   repeat_penalty  = 1.30f;
+    float top_p = 0.95f;
+    float temp = 0.10f;
+    float repeat_penalty = 1.30f;
 
     int32_t n_batch = 8; // batch size for prompt processing
 
-    std::string model = "ggml-alpaca-7b-q4.bin"; // model path
+    std::string model = "gpt4all-lora-quantized.bin"; // model path
     std::string prompt;
 
     bool use_color = true; // use color to distinguish generations and inputs
 
-    bool interactive = true; // interactive mode
+    bool interactive = true;       // interactive mode
     bool interactive_start = true; // reverse prompt immediately
-    std::string antiprompt = ""; // string upon seeing which more user input is prompted
+    std::string antiprompt = "";   // string upon seeing which more user input is prompted
 };
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse(int argc, char **argv, gpt_params &params);
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+void gpt_print_usage(int argc, char **argv, const gpt_params &params);
 
-std::string gpt_random_prompt(std::mt19937 & rng);
+std::string gpt_random_prompt(std::mt19937 &rng);
 
 //
 // Vocab utils
 //
 
-struct gpt_vocab {
-    using id    = int32_t;
+struct gpt_vocab
+{
+    using id = int32_t;
     using token = std::string;
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
 };
 
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
+void replace(std::string &str, const std::string &needle, const std::string &replacement);
 
 // poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::map<std::string, int32_t> json_parse(const std::string &fname);
 
 // split text into tokens
 //
@@ -71,14 +73,14 @@ std::map<std::string, int32_t> json_parse(const std::string & fname);
 // Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); +std::vector gpt_tokenize(const gpt_vocab &vocab, const std::string &text); // TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. // ref: https://github.com/google/sentencepiece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); +std::vector llama_tokenize(const gpt_vocab &vocab, const std::string &text, bool bos); // load the tokens from encoder.json -bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); +bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab); // sample next token given probabilities for each embedding // @@ -86,21 +88,21 @@ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); // - from them, consider only the top tokens with cumulative probability > P // gpt_vocab::id llama_sample_top_p_top_k( - const gpt_vocab & vocab, - const float * logits, - std::vector & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng); + const gpt_vocab &vocab, + const float *logits, + std::vector &last_n_tokens, + double repeat_penalty, + int top_k, + double top_p, + double temp, + std::mt19937 &rng); // filer to top K tokens from list of logits -void sample_top_k(std::vector> & logits_id, int top_k); +void sample_top_k(std::vector> &logits_id, int top_k); // // Quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist); +size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist);