46 changes: 36 additions & 10 deletions .github/workflows/build.yml
@@ -186,9 +186,8 @@ jobs:
defines: '-DRWKV_AVX512=ON'
- build: 'cuda12'
defines: '-DRWKV_CUBLAS=ON'
- build: 'rocm5.5'
defines: '-G "Unix Makefiles" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030"'

- build: 'hip'
defines: ''
steps:
- name: Clone
id: checkout
@@ -206,25 +205,52 @@ jobs:

- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: '5.5.0'
if: ${{ matrix.build == 'hip' }}
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"

- name: Verify ROCm
id: rocm-verify
if: ${{ matrix.build == 'hip' }}
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
if: ${{ matrix.build == 'hip' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1

- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}

- name: Build
id: cmake_build
if: ${{ matrix.build != 'hip' }}
run: |
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

- name: Build-hip
id: cmake_build_hip
if: ${{ matrix.build == 'hip' }}
run: |
mkdir build
cd build
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. -G "Unix Makefiles" -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DRWKV_HIPBLAS=ON -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

- name: Check AVX512F support
id: check_avx512f
@@ -242,7 +268,7 @@ jobs:
- name: Test
id: cmake_test
# Test AVX-512 only when possible
if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'rocm5.5'}}
if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'hip'}}
run: |
cd build
ctest -C Release --verbose
36 changes: 32 additions & 4 deletions CMakeLists.txt
@@ -58,7 +58,7 @@ endfunction()

set(GGML_ACCELERATE ${RWKV_ACCELERATE})
set(GGML_CUDA ${RWKV_CUBLAS})
set(GGML_HIPBLAS ${RWKV_HIPBLAS})
set(GGML_HIP ${RWKV_HIPBLAS})
set(GGML_METAL ${RWKV_METAL})
if (RWKV_OPENBLAS)
set(GGML_BLAS_VENDOR "OpenBLAS")
@@ -107,6 +107,7 @@ if (RWKV_ALL_WARNINGS)
-Wcast-qual
-Wno-unused-function
-Wno-multichar
-Wno-nonnull
)
else()
set(c_flags
@@ -234,7 +235,7 @@ if (GGML_METAL)
)
endif()

if (GGML_HIPBLAS)
if (GGML_HIP)
# CMake on Windows doesn't support the HIP language yet
if (WIN32)
set(CXX_IS_HIPCC TRUE)
@@ -262,12 +263,39 @@ if (GGML_HIPBLAS)
endif()

target_include_directories(rwkv PUBLIC .)
target_include_directories(rwkv PRIVATE ggml/include)
target_include_directories(rwkv PRIVATE ggml/include ggml/src)
target_compile_features(rwkv PUBLIC cxx_std_11)
target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> ${RWKV_EXTRA_LIBS})

if (GGML_METAL)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-metal> $<TARGET_OBJECTS:ggml-blas>)
endif()
if (GGML_CUDA)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-cuda>)
endif()
if (GGML_HIP)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-hip>)
endif()
if (GGML_RPC)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-rpc>)
endif()

target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> $<TARGET_OBJECTS:ggml-base> $<TARGET_OBJECTS:ggml-cpu> ${RWKV_EXTRA_LIBS})

if (RWKV_BUILD_SHARED_LIBRARY)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-base PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-cpu PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (GGML_METAL)
set_target_properties(ggml-metal PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-blas PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (GGML_CUDA)
set_target_properties(ggml-cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (GGML_HIP)
set_target_properties(ggml-hip PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
set_target_properties(rwkv PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD)
10 changes: 4 additions & 6 deletions README.md
@@ -6,20 +6,18 @@ Besides the usual **FP32**, it supports **FP16**, **quantized INT4, INT5 and INT

This project provides [a C library rwkv.h](rwkv.h) and [a convenient Python wrapper](python%2Frwkv_cpp%2Frwkv_cpp_model.py) for it.

[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture, [with the largest model in the family having 14B parameters](https://huggingface.co/BlinkDL/rwkv-4-pile-14b). In contrast to Transformers with `O(n^2)` attention, RWKV requires only the state from the previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.
[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture. In contrast to Transformers with `O(n^2)` attention, RWKV requires only the state from the previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.

[RWKV v5](https://huggingface.co/BlinkDL/rwkv-5-world) is a major upgrade to RWKV architecture, making it competitive with Transformers in quality. RWKV v5 models are supported.

[RWKV v6](https://huggingface.co/BlinkDL/rwkv-6-world) is a further improvement to RWKV architecture, with better quality. RWKV v6 models are supported.
This project supports RWKV [v4](https://huggingface.co/BlinkDL/rwkv-4-pile-14b), [v5](https://huggingface.co/BlinkDL/rwkv-5-world), [v6](https://huggingface.co/BlinkDL/rwkv-6-world) and the latest [v7](https://huggingface.co/BlinkDL/rwkv-7-world) architectures.

Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV-LM-LoRA) is supported through [merge_lora_into_ggml.py script](rwkv%2Fmerge_lora_into_ggml.py).

<!-- TODO: Update data below -->

## Quality and performance

If you use `rwkv.cpp` for anything serious, please [test all available formats for perplexity and latency](rwkv%2Fmeasure_pexplexity.py) on a representative dataset, and decide which trade-off is best for you.

In general, **`RWKV v5` models are as fast as `RWKV v4` models**, with minor differences in latency and memory consumption, and with far higher quality than `v4`. Therefore, it is recommended to use `RWKV v5`.

The table below is for reference only. Measurements were made on a 4C/8T x86 CPU with AVX2, using 4 threads. The models are `RWKV v4 Pile 169M` and `RWKV v4 Pile 1.5B`.

| Format | Perplexity (169M) | Latency, ms (1.5B) | File size, GB (1.5B) |
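For readers of the README hunks above: they describe state-carrying (O(n)) inference through the Python wrapper. Below is a minimal sketch of that usage, not part of this PR. It reuses calls that appear in the scripts changed here (`load_rwkv_shared_library`, `RWKVModel`, `get_tokenizer`); the `eval(token, state)` signature, the `'auto'` tokenizer choice and the model path are assumptions for illustration.

```python
# Minimal sketch (not part of this PR): state-carrying inference with the Python wrapper.
# Assumes the repository's python/ directory is on sys.path, that 'auto' selects a tokenizer
# from the vocabulary size, and that model.eval(token, state) returns (logits, new_state).
from rwkv_cpp import rwkv_cpp_shared_library, rwkv_cpp_model
from tokenizer_util import get_tokenizer

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, 'model.bin', gpu_layer_count=0)  # hypothetical path

tokenizer_decode, tokenizer_encode = get_tokenizer('auto', model.n_vocab)

# Feed the prompt one token at a time; only the previous step's state is carried forward.
state = None
logits = None
for token in tokenizer_encode('Hello'):
    logits, state = model.eval(token, state)

# Greedy continuation for a few tokens.
generated = []
for _ in range(16):
    token = int(logits.argmax())
    generated.append(token)
    logits, state = model.eval(token, state)

print(tokenizer_decode(generated))
model.free()
```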
2 changes: 2 additions & 0 deletions extras/quantize.c
@@ -25,8 +25,10 @@ bool QueryPerformanceCounter(uint64_t* lpPerformanceCount);
static enum ggml_type type_from_string(const char * string) {
if (strcmp(string, "Q4_0") == 0) return GGML_TYPE_Q4_0;
if (strcmp(string, "Q4_1") == 0) return GGML_TYPE_Q4_1;
if (strcmp(string, "Q4_K") == 0) return GGML_TYPE_Q4_K;
if (strcmp(string, "Q5_0") == 0) return GGML_TYPE_Q5_0;
if (strcmp(string, "Q5_1") == 0) return GGML_TYPE_Q5_1;
if (strcmp(string, "Q5_K") == 0) return GGML_TYPE_Q5_K;
if (strcmp(string, "Q8_0") == 0) return GGML_TYPE_Q8_0;
return GGML_TYPE_COUNT;
}
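The quantize.c hunk above adds `Q4_K` and `Q5_K` to the format names the standalone quantize tool accepts. Below is a hedged sketch of the same operation driven from Python (not part of this PR), assuming the shared library's `rwkv_quantize_model_file` accepts the same format strings; the file paths are placeholders.

```python
# Hedged sketch (not part of this PR): quantizing a converted ggml model to one of the
# newly accepted K-quant formats. Assumes the Python wrapper exposes
# rwkv_quantize_model_file() and that it accepts the same format names as extras/quantize.c.
from rwkv_cpp import rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
library.rwkv_quantize_model_file(
    'rwkv-model-FP16.bin',  # hypothetical source path
    'rwkv-model-Q5_K.bin',  # hypothetical destination path
    'Q5_K',                 # one of the format strings added above
)
```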
2 changes: 1 addition & 1 deletion ggml
Submodule ggml updated from 3e7e5e to c8bd0f
17 changes: 9 additions & 8 deletions python/chat_with_bot.py
@@ -40,6 +40,7 @@

parser = argparse.ArgumentParser(description='Provide terminal-based chat interface for RWKV model')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

@@ -48,7 +49,7 @@
with open(script_dir / 'prompt' / f'{LANGUAGE}-{PROMPT_TYPE}.json', 'r', encoding='utf8') as json_file:
prompt_data = json.load(json_file)

user, bot, separator, init_prompt = prompt_data['user'], prompt_data['bot'], prompt_data['separator'], prompt_data['prompt']
user, assistant, separator, init_prompt = prompt_data['user'], prompt_data['assistant'], prompt_data['separator'], prompt_data['prompt']

if init_prompt == '':
raise ValueError('Prompt must not be empty')
@@ -57,7 +58,7 @@
print(f'System info: {library.rwkv_get_system_info_string()}')

print('Loading RWKV model')
model = rwkv_cpp_model.RWKVModel(library, args.model_path)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)

@@ -154,7 +155,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
if msg == '+reset':
load_thread_state('chat_init')
save_thread_state('chat')
print(f'{bot}{separator} Chat reset.\n')
print(f'{assistant}{separator} Chat reset.\n')
continue
elif msg[:5].lower() == '+gen ' or msg[:3].lower() == '+i ' or msg[:4].lower() == '+qa ' or msg[:4].lower() == '+qq ' or msg.lower() == '+++' or msg.lower() == '++':

@@ -194,7 +195,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
load_thread_state('chat_init')

real_msg = msg[4:].strip()
new = f'{user}{separator} {real_msg}\n\n{bot}{separator}'
new = f'{user}{separator} {real_msg}\n\n{assistant}{separator}'

process_tokens(tokenizer_encode(new))
save_thread_state('gen_0')
@@ -225,17 +226,17 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
except Exception as e:
print(e)
continue
# chat with bot
# chat with assistant
else:
load_thread_state('chat')
new = f'{user}{separator} {msg}\n\n{bot}{separator}'
new = f'{user}{separator} {msg}\n\n{assistant}{separator}'
process_tokens(tokenizer_encode(new), new_line_logit_bias=-999999999)
save_thread_state('chat_pre')

thread = 'chat'

# Print bot response
print(f'> {bot}{separator}', end='')
# Print assistant response
print(f'> {assistant}{separator}', end='')

start_index: int = len(processed_tokens)
accumulated_tokens: List[int] = []
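The chat_with_bot.py hunks above rename the prompt role key from `bot` to `assistant` and keep building each turn as `{user}{separator} {msg}\n\n{assistant}{separator}`. Here is a small self-contained sketch (not part of this PR) of that serialization under the new key names; the dict values and message are made up, mirroring the updated prompt JSON files further below.

```python
# Sketch (not part of this PR): how a chat turn is serialized with the renamed prompt keys.
prompt_data = {
    'user': 'User',
    'assistant': 'Assistant',  # this key was named 'bot' before this change
    'separator': ':',
}

user = prompt_data['user']
assistant = prompt_data['assistant']
separator = prompt_data['separator']

msg = 'What is RWKV?'
new = f'{user}{separator} {msg}\n\n{assistant}{separator}'
print(repr(new))  # 'User: What is RWKV?\n\nAssistant:'
```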
42 changes: 39 additions & 3 deletions python/convert_pytorch_to_ggml.py
@@ -35,8 +35,11 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
is_v5_1_or_2: bool = 'blocks.0.att.ln_x.weight' in state_dict
is_v5_2: bool = 'blocks.0.att.gate.weight' in state_dict
is_v6_0: bool = 'blocks.0.att.time_maa_x' in state_dict
is_v7_0: bool = 'blocks.0.att.k_k' in state_dict

if is_v6_0:
if is_v7_0:
print('Detected RWKV v7.0')
elif is_v6_0:
print('Detected RWKV v6.0')
elif is_v5_2:
print('Detected RWKV v5.2')
@@ -45,6 +48,23 @@
else:
print('Detected RWKV v4')

if is_v7_0:
# concat to reduce some cpu overhead during ggml inference
state_dict_new = {}
for k in state_dict.keys():
if 'att.x_' in k:
l = int(k.split('.')[1].split('.')[0])
try:
state_dict_new[f'blocks.{l}.att.x_rwkvag'] = torch.cat(
[state_dict_new[f'blocks.{l}.att.x_rwkvag'], state_dict[k]], dim=0)
except KeyError:
state_dict_new[f'blocks.{l}.att.x_rwkvag'] = state_dict[k]
else:
state_dict_new[k] = state_dict[k]

del state_dict[k]
state_dict = state_dict_new

with open(dest_path, 'wb') as out_file:
is_FP16: bool = data_type == 'FP16' or data_type == 'float16'

@@ -68,7 +88,16 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
if '.time_' in k:
tensor = tensor.squeeze()

if is_v6_0:
if is_v7_0:
if any(s in k for s in [
'.w1', '.w2',
'.a1', '.a2',
'.v1', '.v2',
'.g1', '.g2',
]):
tensor = tensor.transpose(0, 1)

elif is_v6_0:
if '.time_faaaa' in k:
tensor = tensor.unsqueeze(-1)
if '.time_maa_w1' in k or '.time_decay_w' in k:
@@ -95,7 +124,14 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
tensor = -torch.exp(tensor)

# Keep 1-dim vectors and small matrices in FP32
if is_FP16 and len(tensor.shape) > 1 and '.time_' not in k:
if is_FP16 and len(tensor.shape) > 1 and all(
s not in k for s in [
'.time_',
'.k_k', '.k_a', '.r_k',
'.x_rwkvag', '.x_k',
'.w0', '.a0', '.v0',
]
):
tensor = tensor.half()

shape = tensor.shape
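The v7 branch in the conversion script above concatenates each layer's `att.x_*` token-shift mixing vectors into a single `x_rwkvag` tensor ("concat to reduce some cpu overhead during ggml inference"). Below is a self-contained sketch of what that concatenation produces for one layer; the tensor shapes and the embedding size are assumptions for illustration, not taken from this diff.

```python
# Sketch (not part of this PR): the per-layer 'x_rwkvag' concatenation done for RWKV v7.
# The (1, 1, n_embd) shapes and the n_embd value are assumed for illustration.
import torch

n_embd = 8
layer = 0

# Hypothetical per-layer token-shift mixing vectors, in r/w/k/v/a/g order.
mix_vectors = {
    f'blocks.{layer}.att.x_{name}': torch.randn(1, 1, n_embd)
    for name in ('r', 'w', 'k', 'v', 'a', 'g')
}

# Concatenating along dim 0 turns six small tensors into one (6, 1, n_embd) tensor,
# so inference can apply all token-shift mixes with fewer ggml ops per layer.
x_rwkvag = torch.cat(list(mix_vectors.values()), dim=0)
print(x_rwkvag.shape)  # torch.Size([6, 1, 8])
```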
3 changes: 2 additions & 1 deletion python/generate_completions.py
@@ -29,6 +29,7 @@

parser = argparse.ArgumentParser(description='Generate completions from RWKV model based on a prompt')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

@@ -39,7 +40,7 @@
print(f'System info: {library.rwkv_get_system_info_string()}')

print('Loading RWKV model')
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=0)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=args.num_gpu_layers)

tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)

3 changes: 2 additions & 1 deletion python/inference_example.py
@@ -10,12 +10,13 @@
# Parse received arguments.
parser = argparse.ArgumentParser(description='Generate some text with an RWKV model')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

# Load the model.
library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, args.model_path)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

# Set up the tokenizer.
tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)
2 changes: 1 addition & 1 deletion python/prompt/Chinese-Chat.json
@@ -1,6 +1,6 @@
{
"user": "Bob",
"bot": "Alice",
"assistant": "Alice",
"separator": ":",
"prompt": "\nThe following is a coherent verbose detailed conversation between a Chinese girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: lhc\n\nAlice: LHC是指大型强子对撞机(Large Hadron Collider),是世界最大最强的粒子加速器,由欧洲核子中心(CERN)在瑞士日内瓦地下建造。LHC的原理是加速质子(氢离子)并让它们相撞,让科学家研究基本粒子和它们之间的相互作用,并在2012年证实了希格斯玻色子的存在。\n\nBob: 企鹅会飞吗\n\nAlice: 企鹅是不会飞的。企鹅的翅膀短而扁平,更像是游泳时的一对桨。企鹅的身体结构和羽毛密度也更适合在水中游泳,而不是飞行。\n\n"
}
6 changes: 3 additions & 3 deletions python/prompt/Chinese-QA.json
@@ -1,6 +1,6 @@
{
"user": "Q",
"bot": "A",
"user": "User",
"assistant": "Assistant",
"separator": ":",
"prompt": "\nExpert Questions & Helpful Answers\n\nAsk Research Experts\n\n"
"prompt": "User: 你好\n\nAssistant: 你好,有什么我可以帮助你的吗?\n\n"
}
2 changes: 1 addition & 1 deletion python/prompt/English-Chat.json
@@ -1,6 +1,6 @@
{
"user": "Bob",
"bot": "Alice",
"assistant": "Alice",
"separator": ":",
"prompt": "\nThe following is a coherent verbose detailed conversation between a girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice is unlikely to disagree with Bob, and Alice doesn't like to ask Bob questions. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: Hello Alice, how are you doing?\n\nAlice: Hi! Thanks, I'm fine. What about you?\n\nBob: I am fine. It's nice to see you. Look, here is a store selling tea and juice.\n\nAlice: Sure. Let's go inside. I would like to have some Mocha latte, which is my favourite!\n\nBob: What is it?\n\nAlice: Mocha latte is usually made with espresso, milk, chocolate, and frothed milk. Its flavors are frequently sweet.\n\nBob: Sounds tasty. I'll try it next time. Would you like to chat with me for a while?\n\nAlice: Of course! I'm glad to answer your questions or give helpful advices. You know, I am confident with my expertise. So please go ahead!\n\n"
}
4 changes: 2 additions & 2 deletions python/prompt/English-QA.json
@@ -1,6 +1,6 @@
{
"user": "User",
"bot": "Bot",
"assistant": "Assistant",
"separator": ":",
"prompt": "\nThe following is a verbose and detailed conversation between an AI assistant called Bot, and a human user called User. Bot is intelligent, knowledgeable, wise and polite.\n\nUser: french revolution what year\n\nBot: The French Revolution started in 1789, and lasted 10 years until 1799.\n\nUser: 3+5=?\n\nBot: The answer is 8.\n\nUser: guess i marry who ?\n\nBot: Only if you tell me more about yourself - what are your interests?\n\nUser: solve for a: 9-a=2\n\nBot: The answer is a = 7, because 9 - 7 = 2.\n\nUser: wat is lhc\n\nBot: LHC is a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012.\n\n"
"prompt": "User: hi\n\nAssistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
}