46 changes: 36 additions & 10 deletions .github/workflows/build.yml
@@ -186,9 +186,8 @@ jobs:
defines: '-DRWKV_AVX512=ON'
- build: 'cuda12'
defines: '-DRWKV_CUBLAS=ON'
- build: 'rocm5.5'
defines: '-G "Unix Makefiles" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030"'

- build: 'hip'
defines: ''
steps:
- name: Clone
id: checkout
@@ -206,25 +205,52 @@ jobs:

- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: '5.5.0'
if: ${{ matrix.build == 'hip' }}
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"

- name: Verify ROCm
id: rocm-verify
if: ${{ matrix.build == 'hip' }}
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
if: ${{ matrix.build == 'hip' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1

- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}

- name: Build
id: cmake_build
if: ${{ matrix.build != 'hip' }}
run: |
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

- name: Build-hip
id: cmake_build_hip
if: ${{ matrix.build == 'hip' }}
run: |
mkdir build
cd build
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. -G "Unix Makefiles" -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DRWKV_HIPBLAS=ON -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

- name: Check AVX512F support
id: check_avx512f
@@ -242,7 +268,7 @@ jobs:
- name: Test
id: cmake_test
# Test AVX-512 only when possible
if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'rocm5.5'}}
if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'hip'}}
run: |
cd build
ctest -C Release --verbose
36 changes: 32 additions & 4 deletions CMakeLists.txt
@@ -58,7 +58,7 @@ endfunction()

set(GGML_ACCELERATE ${RWKV_ACCELERATE})
set(GGML_CUDA ${RWKV_CUBLAS})
set(GGML_HIPBLAS ${RWKV_HIPBLAS})
set(GGML_HIP ${RWKV_HIPBLAS})
set(GGML_METAL ${RWKV_METAL})
if (RWKV_OPENBLAS)
set(GGML_BLAS_VENDOR "OpenBLAS")
@@ -107,6 +107,7 @@ if (RWKV_ALL_WARNINGS)
-Wcast-qual
-Wno-unused-function
-Wno-multichar
-Wno-nonnull
)
else()
set(c_flags
@@ -234,7 +235,7 @@ if (GGML_METAL)
)
endif()

if (GGML_HIPBLAS)
if (GGML_HIP)
# CMake on Windows doesn't support the HIP language yet
if (WIN32)
set(CXX_IS_HIPCC TRUE)
@@ -262,12 +263,39 @@ if (GGML_HIPBLAS)
endif()

target_include_directories(rwkv PUBLIC .)
target_include_directories(rwkv PRIVATE ggml/include)
target_include_directories(rwkv PRIVATE ggml/include ggml/src)
target_compile_features(rwkv PUBLIC cxx_std_11)
target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> ${RWKV_EXTRA_LIBS})

if (GGML_METAL)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-metal> $<TARGET_OBJECTS:ggml-blas>)
endif()
if (GGML_CUDA)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-cuda>)
endif()
if (GGML_HIP)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-hip>)
endif()
if (GGML_RPC)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-rpc>)
endif()

target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> $<TARGET_OBJECTS:ggml-base> $<TARGET_OBJECTS:ggml-cpu> ${RWKV_EXTRA_LIBS})

if (RWKV_BUILD_SHARED_LIBRARY)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-base PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-cpu PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (GGML_METAL)
set_target_properties(ggml-metal PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(ggml-blas PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (GGML_CUDA)
set_target_properties(ggml-cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (GGML_HIP)
set_target_properties(ggml-hip PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
set_target_properties(rwkv PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD)
10 changes: 4 additions & 6 deletions README.md
@@ -6,20 +6,18 @@ Besides the usual **FP32**, it supports **FP16**, **quantized INT4, INT5 and INT

This project provides [a C library rwkv.h](rwkv.h) and [a convenient Python wrapper](python%2Frwkv_cpp%2Frwkv_cpp_model.py) for it.

[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture, [with the largest model in the family having 14B parameters](https://huggingface.co/BlinkDL/rwkv-4-pile-14b). In contrast to Transformers with `O(n^2)` attention, RWKV requires only the state from the previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.
[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture. In contrast to Transformers with `O(n^2)` attention, RWKV requires only the state from the previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.

[RWKV v5](https://huggingface.co/BlinkDL/rwkv-5-world) is a major upgrade to RWKV architecture, making it competitive with Transformers in quality. RWKV v5 models are supported.

[RWKV v6](https://huggingface.co/BlinkDL/rwkv-6-world) is a further improvement to RWKV architecture, with better quality. RWKV v6 models are supported.
This project supports RWKV [v4](https://huggingface.co/BlinkDL/rwkv-4-pile-14b), [v5](https://huggingface.co/BlinkDL/rwkv-5-world), [v6](https://huggingface.co/BlinkDL/rwkv-6-world) and the latest [v7](https://huggingface.co/BlinkDL/rwkv-7-world) architectures.

Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV-LM-LoRA) is supported through [merge_lora_into_ggml.py script](rwkv%2Fmerge_lora_into_ggml.py).

<!-- TODO: Update data below -->

## Quality and performance

If you use `rwkv.cpp` for anything serious, please [test all available formats for perplexity and latency](rwkv%2Fmeasure_pexplexity.py) on a representative dataset, and decide which trade-off is best for you.

In general, **`RWKV v5` models are as fast as `RWKV v4` models**, with minor differences in latency and memory consumption, and with far higher quality than `v4`. Therefore, it is recommended to use `RWKV v5`.

The table below is for reference only. Measurements were made on a 4C/8T x86 CPU with AVX2, using 4 threads. The models are `RWKV v4 Pile 169M` and `RWKV v4 Pile 1.5B`.

| Format | Perplexity (169M) | Latency, ms (1.5B) | File size, GB (1.5B) |
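For readers of the README hunks above: they describe state-carrying (O(n)) inference through the Python wrapper. Below is a minimal sketch of that usage, not part of this PR. It reuses calls that appear in the scripts changed here (`load_rwkv_shared_library`, `RWKVModel`, `get_tokenizer`); the `eval(token, state)` signature, the `'auto'` tokenizer choice and the model path are assumptions for illustration.

```python
# Minimal sketch (not part of this PR): state-carrying inference with the Python wrapper.
# Assumes the repository's python/ directory is on sys.path, that 'auto' selects a tokenizer
# from the vocabulary size, and that model.eval(token, state) returns (logits, new_state).
from rwkv_cpp import rwkv_cpp_shared_library, rwkv_cpp_model
from tokenizer_util import get_tokenizer

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, 'model.bin', gpu_layer_count=0)  # hypothetical path

tokenizer_decode, tokenizer_encode = get_tokenizer('auto', model.n_vocab)

# Feed the prompt one token at a time; only the previous step's state is carried forward.
state = None
logits = None
for token in tokenizer_encode('Hello'):
    logits, state = model.eval(token, state)

# Greedy continuation for a few tokens.
generated = []
for _ in range(16):
    token = int(logits.argmax())
    generated.append(token)
    logits, state = model.eval(token, state)

print(tokenizer_decode(generated))
model.free()
```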
2 changes: 2 additions & 0 deletions extras/quantize.c
@@ -25,8 +25,10 @@ bool QueryPerformanceCounter(uint64_t* lpPerformanceCount);
static enum ggml_type type_from_string(const char * string) {
if (strcmp(string, "Q4_0") == 0) return GGML_TYPE_Q4_0;
if (strcmp(string, "Q4_1") == 0) return GGML_TYPE_Q4_1;
if (strcmp(string, "Q4_K") == 0) return GGML_TYPE_Q4_K;
if (strcmp(string, "Q5_0") == 0) return GGML_TYPE_Q5_0;
if (strcmp(string, "Q5_1") == 0) return GGML_TYPE_Q5_1;
if (strcmp(string, "Q5_K") == 0) return GGML_TYPE_Q5_K;
if (strcmp(string, "Q8_0") == 0) return GGML_TYPE_Q8_0;
return GGML_TYPE_COUNT;
}
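The quantize.c hunk above adds `Q4_K` and `Q5_K` to the format names the standalone quantize tool accepts. Below is a hedged sketch of the same operation driven from Python (not part of this PR), assuming the shared library's `rwkv_quantize_model_file` accepts the same format strings; the file paths are placeholders.

```python
# Hedged sketch (not part of this PR): quantizing a converted ggml model to one of the
# newly accepted K-quant formats. Assumes the Python wrapper exposes
# rwkv_quantize_model_file() and that it accepts the same format names as extras/quantize.c.
from rwkv_cpp import rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
library.rwkv_quantize_model_file(
    'rwkv-model-FP16.bin',  # hypothetical source path
    'rwkv-model-Q5_K.bin',  # hypothetical destination path
    'Q5_K',                 # one of the format strings added above
)
```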
2 changes: 1 addition & 1 deletion ggml
Submodule ggml updated from 3e7e5e to c8bd0f
17 changes: 9 additions & 8 deletions python/chat_with_bot.py
@@ -40,6 +40,7 @@

parser = argparse.ArgumentParser(description='Provide terminal-based chat interface for RWKV model')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

@@ -48,7 +49,7 @@
with open(script_dir / 'prompt' / f'{LANGUAGE}-{PROMPT_TYPE}.json', 'r', encoding='utf8') as json_file:
prompt_data = json.load(json_file)

user, bot, separator, init_prompt = prompt_data['user'], prompt_data['bot'], prompt_data['separator'], prompt_data['prompt']
user, assistant, separator, init_prompt = prompt_data['user'], prompt_data['assistant'], prompt_data['separator'], prompt_data['prompt']

if init_prompt == '':
raise ValueError('Prompt must not be empty')
@@ -57,7 +58,7 @@
print(f'System info: {library.rwkv_get_system_info_string()}')

print('Loading RWKV model')
model = rwkv_cpp_model.RWKVModel(library, args.model_path)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)

@@ -154,7 +155,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
if msg == '+reset':
load_thread_state('chat_init')
save_thread_state('chat')
print(f'{bot}{separator} Chat reset.\n')
print(f'{assistant}{separator} Chat reset.\n')
continue
elif msg[:5].lower() == '+gen ' or msg[:3].lower() == '+i ' or msg[:4].lower() == '+qa ' or msg[:4].lower() == '+qq ' or msg.lower() == '+++' or msg.lower() == '++':

@@ -194,7 +195,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
load_thread_state('chat_init')

real_msg = msg[4:].strip()
new = f'{user}{separator} {real_msg}\n\n{bot}{separator}'
new = f'{user}{separator} {real_msg}\n\n{assistant}{separator}'

process_tokens(tokenizer_encode(new))
save_thread_state('gen_0')
@@ -225,17 +226,17 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
except Exception as e:
print(e)
continue
# chat with bot
# chat with assistant
else:
load_thread_state('chat')
new = f'{user}{separator} {msg}\n\n{bot}{separator}'
new = f'{user}{separator} {msg}\n\n{assistant}{separator}'
process_tokens(tokenizer_encode(new), new_line_logit_bias=-999999999)
save_thread_state('chat_pre')

thread = 'chat'

# Print bot response
print(f'> {bot}{separator}', end='')
# Print assistant response
print(f'> {assistant}{separator}', end='')

start_index: int = len(processed_tokens)
accumulated_tokens: List[int] = []
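The chat_with_bot.py hunks above rename the prompt role key from `bot` to `assistant` and keep building each turn as `{user}{separator} {msg}\n\n{assistant}{separator}`. Here is a small self-contained sketch (not part of this PR) of that serialization under the new key names; the dict values and message are made up, mirroring the updated prompt JSON files further below.

```python
# Sketch (not part of this PR): how a chat turn is serialized with the renamed prompt keys.
prompt_data = {
    'user': 'User',
    'assistant': 'Assistant',  # this key was named 'bot' before this change
    'separator': ':',
}

user = prompt_data['user']
assistant = prompt_data['assistant']
separator = prompt_data['separator']

msg = 'What is RWKV?'
new = f'{user}{separator} {msg}\n\n{assistant}{separator}'
print(repr(new))  # 'User: What is RWKV?\n\nAssistant:'
```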
42 changes: 39 additions & 3 deletions python/convert_pytorch_to_ggml.py
@@ -35,8 +35,11 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
is_v5_1_or_2: bool = 'blocks.0.att.ln_x.weight' in state_dict
is_v5_2: bool = 'blocks.0.att.gate.weight' in state_dict
is_v6_0: bool = 'blocks.0.att.time_maa_x' in state_dict
is_v7_0: bool = 'blocks.0.att.k_k' in state_dict

if is_v6_0:
if is_v7_0:
print('Detected RWKV v7.0')
elif is_v6_0:
print('Detected RWKV v6.0')
elif is_v5_2:
print('Detected RWKV v5.2')
@@ -45,6 +48,23 @@
else:
print('Detected RWKV v4')

if is_v7_0:
# concat to reduce some cpu overhead during ggml inference
state_dict_new = {}
for k in state_dict.keys():
if 'att.x_' in k:
l = int(k.split('.')[1].split('.')[0])
try:
state_dict_new[f'blocks.{l}.att.x_rwkvag'] = torch.cat(
[state_dict_new[f'blocks.{l}.att.x_rwkvag'], state_dict[k]], dim=0)
except KeyError:
state_dict_new[f'blocks.{l}.att.x_rwkvag'] = state_dict[k]
else:
state_dict_new[k] = state_dict[k]

del state_dict[k]
state_dict = state_dict_new

with open(dest_path, 'wb') as out_file:
is_FP16: bool = data_type == 'FP16' or data_type == 'float16'

@@ -68,7 +88,16 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
if '.time_' in k:
tensor = tensor.squeeze()

if is_v6_0:
if is_v7_0:
if any(s in k for s in [
'.w1', '.w2',
'.a1', '.a2',
'.v1', '.v2',
'.g1', '.g2',
]):
tensor = tensor.transpose(0, 1)

elif is_v6_0:
if '.time_faaaa' in k:
tensor = tensor.unsqueeze(-1)
if '.time_maa_w1' in k or '.time_decay_w' in k:
@@ -95,7 +124,14 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
tensor = -torch.exp(tensor)

# Keep 1-dim vectors and small matrices in FP32
if is_FP16 and len(tensor.shape) > 1 and '.time_' not in k:
if is_FP16 and len(tensor.shape) > 1 and all(
s not in k for s in [
'.time_',
'.k_k', '.k_a', '.r_k',
'.x_rwkvag', '.x_k',
'.w0', '.a0', '.v0',
]
):
tensor = tensor.half()

shape = tensor.shape
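The v7 branch in the conversion script above concatenates each layer's `att.x_*` token-shift mixing vectors into a single `x_rwkvag` tensor ("concat to reduce some cpu overhead during ggml inference"). Below is a self-contained sketch of what that concatenation produces for one layer; the tensor shapes and the embedding size are assumptions for illustration, not taken from this diff.

```python
# Sketch (not part of this PR): the per-layer 'x_rwkvag' concatenation done for RWKV v7.
# The (1, 1, n_embd) shapes and the n_embd value are assumed for illustration.
import torch

n_embd = 8
layer = 0

# Hypothetical per-layer token-shift mixing vectors, in r/w/k/v/a/g order.
mix_vectors = {
    f'blocks.{layer}.att.x_{name}': torch.randn(1, 1, n_embd)
    for name in ('r', 'w', 'k', 'v', 'a', 'g')
}

# Concatenating along dim 0 turns six small tensors into one (6, 1, n_embd) tensor,
# so inference can apply all token-shift mixes with fewer ggml ops per layer.
x_rwkvag = torch.cat(list(mix_vectors.values()), dim=0)
print(x_rwkvag.shape)  # torch.Size([6, 1, 8])
```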
3 changes: 2 additions & 1 deletion python/generate_completions.py
@@ -29,6 +29,7 @@

parser = argparse.ArgumentParser(description='Generate completions from RWKV model based on a prompt')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

@@ -39,7 +40,7 @@
print(f'System info: {library.rwkv_get_system_info_string()}')

print('Loading RWKV model')
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=0)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=args.num_gpu_layers)

tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)

3 changes: 2 additions & 1 deletion python/inference_example.py
@@ -10,12 +10,13 @@
# Parse received arguments.
parser = argparse.ArgumentParser(description='Generate some text with an RWKV model')
parser.add_argument('model_path', help='Path to RWKV model in ggml format')
parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
add_tokenizer_argument(parser)
args = parser.parse_args()

# Load the model.
library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, args.model_path)
model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

# Set up the tokenizer.
tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)
2 changes: 1 addition & 1 deletion python/prompt/Chinese-Chat.json
@@ -1,6 +1,6 @@
{
"user": "Bob",
"bot": "Alice",
"assistant": "Alice",
"separator": ":",
"prompt": "\nThe following is a coherent verbose detailed conversation between a Chinese girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: lhc\n\nAlice: LHC是指大型强子对撞机(Large Hadron Collider),是世界最大最强的粒子加速器,由欧洲核子中心(CERN)在瑞士日内瓦地下建造。LHC的原理是加速质子(氢离子)并让它们相撞,让科学家研究基本粒子和它们之间的相互作用,并在2012年证实了希格斯玻色子的存在。\n\nBob: 企鹅会飞吗\n\nAlice: 企鹅是不会飞的。企鹅的翅膀短而扁平,更像是游泳时的一对桨。企鹅的身体结构和羽毛密度也更适合在水中游泳,而不是飞行。\n\n"
}
6 changes: 3 additions & 3 deletions python/prompt/Chinese-QA.json
@@ -1,6 +1,6 @@
{
"user": "Q",
"bot": "A",
"user": "User",
"assistant": "Assistant",
"separator": ":",
"prompt": "\nExpert Questions & Helpful Answers\n\nAsk Research Experts\n\n"
"prompt": "User: 你好\n\nAssistant: 你好,有什么我可以帮助你的吗?\n\n"
}
2 changes: 1 addition & 1 deletion python/prompt/English-Chat.json
@@ -1,6 +1,6 @@
{
"user": "Bob",
"bot": "Alice",
"assistant": "Alice",
"separator": ":",
"prompt": "\nThe following is a coherent verbose detailed conversation between a girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice is unlikely to disagree with Bob, and Alice doesn't like to ask Bob questions. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: Hello Alice, how are you doing?\n\nAlice: Hi! Thanks, I'm fine. What about you?\n\nBob: I am fine. It's nice to see you. Look, here is a store selling tea and juice.\n\nAlice: Sure. Let's go inside. I would like to have some Mocha latte, which is my favourite!\n\nBob: What is it?\n\nAlice: Mocha latte is usually made with espresso, milk, chocolate, and frothed milk. Its flavors are frequently sweet.\n\nBob: Sounds tasty. I'll try it next time. Would you like to chat with me for a while?\n\nAlice: Of course! I'm glad to answer your questions or give helpful advices. You know, I am confident with my expertise. So please go ahead!\n\n"
}
4 changes: 2 additions & 2 deletions python/prompt/English-QA.json
@@ -1,6 +1,6 @@
{
"user": "User",
"bot": "Bot",
"assistant": "Assistant",
"separator": ":",
"prompt": "\nThe following is a verbose and detailed conversation between an AI assistant called Bot, and a human user called User. Bot is intelligent, knowledgeable, wise and polite.\n\nUser: french revolution what year\n\nBot: The French Revolution started in 1789, and lasted 10 years until 1799.\n\nUser: 3+5=?\n\nBot: The answer is 8.\n\nUser: guess i marry who ?\n\nBot: Only if you tell me more about yourself - what are your interests?\n\nUser: solve for a: 9-a=2\n\nBot: The answer is a = 7, because 9 - 7 = 2.\n\nUser: wat is lhc\n\nBot: LHC is a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012.\n\n"
"prompt": "User: hi\n\nAssistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.\n\n"
}