Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
888788d
Enable ROCm backend with custom ops integration (#1683)
pnunna93 Jun 20, 2025
fd2949a
Fix AdamW documentation (#1686)
agupta2304 Jun 23, 2025
aca9778
Make minor improvements to optimizer.py (#1687)
agupta2304 Jun 24, 2025
1abd5e7
Add CUDA 12.9 build (#1689)
matthewdouglas Jun 27, 2025
6d0a5cd
Temporarily disable HPU tests
matthewdouglas Jun 30, 2025
bdcee0f
fix triton kernel on the correct device (#1691)
jiqing-feng Jul 1, 2025
e28d4d9
Update README.md
matthewdouglas Jul 1, 2025
ed398d2
CI: Test with PyTorch 2.8.0 RC (#1693)
matthewdouglas Jul 1, 2025
ed9c8fc
Automatically call CMake as part of PEP 517 build (#1512)
mgorny Jul 1, 2025
3278614
Added inference benchmark
Egor-Krivov Jul 2, 2025
ea4b59f
fix log
jiqing-feng Jul 3, 2025
ee01736
Merge pull request #1697 from jiqing-feng/log
matthewdouglas Jul 8, 2025
adc7fda
Merge pull request #1696 from Egor-Krivov/egor/inf_benchmark
matthewdouglas Jul 8, 2025
b43edf5
Add interface for 8bit optimizer
Egor-Krivov Jul 11, 2025
35ce337
Fixed bugs
Egor-Krivov Jul 11, 2025
abf4a1e
enabled tests
Egor-Krivov Jul 14, 2025
3b89a05
Add 32bit optimizer interface
Egor-Krivov Jul 14, 2025
223fea5
Add no_cpu for optimizers
Egor-Krivov Jul 14, 2025
4075a64
Update to kernel registration
Egor-Krivov Jul 14, 2025
236124e
Reverse lion
Egor-Krivov Jul 14, 2025
36f5c4f
Changed number of errors
Egor-Krivov Jul 14, 2025
24d9139
Removed cpu
Egor-Krivov Jul 14, 2025
e33ba1c
Added mutated args to the schema
Egor-Krivov Jul 14, 2025
0f6fe6b
Fixed default args
Egor-Krivov Jul 14, 2025
941681d
Merge pull request #1706 from Egor-Krivov/egor/8bit_int
matthewdouglas Jul 14, 2025
14147f6
Test fix
matthewdouglas Jul 14, 2025
df67c70
Create FUNDING.yml
matthewdouglas Jul 21, 2025
33449ee
Merge pull request #1714 from bitsandbytes-foundation/add-funding
matthewdouglas Jul 21, 2025
ec19229
Add Volta support in cu128/cu129 builds
matthewdouglas Jul 21, 2025
e54dc12
Merge pull request #1715 from bitsandbytes-foundation/adjust-cuda-build
matthewdouglas Jul 21, 2025
1dbe602
Fix Params4bit tensor subclass handling
ved1beta Jul 31, 2025
639f8c0
Fixing quantization uint8 packing bug for NF4 and FP4
Mhmd-Hisham Aug 2, 2025
2938c73
test_params4bit_torch_chunk_split
ved1beta Aug 2, 2025
0ecb8fb
lint
ved1beta Aug 4, 2025
4265392
Merge pull request #1719 from ved1beta/fsdp_integration2
matthewdouglas Aug 6, 2025
19fe95a
Merge pull request #1721 from Mhmd-Hisham/quantization-packing-bug-fix
matthewdouglas Aug 6, 2025
5959389
Temporary updates for release
matthewdouglas Aug 11, 2025
c0dcdf2
Release 0.47.0
matthewdouglas Aug 11, 2025
9088107
Bump dev version
matthewdouglas Aug 11, 2025
7bfe923
Restore temporary changes from release
matthewdouglas Aug 11, 2025
ff389db
add py.typed (#1726)
cyyever Aug 25, 2025
c76e208
Enable F841 (#1727)
cyyever Sep 2, 2025
a09d05a
add int mm for xpu after torch 2.9 (#1736)
jiqing-feng Sep 3, 2025
39dd847
for intel xpu case, use MatMul8bitFp even not use ipex (#1728)
kaixuanliu Sep 3, 2025
27549fb
4bit quantization for arbitrary `nn.Parameter` (#1720)
matthewdouglas Sep 8, 2025
d731fc4
Adjust 4bit test tolerance on CPU for larger blocksizes (#1749)
matthewdouglas Sep 8, 2025
6a07ffe
Test improvements (#1750)
matthewdouglas Sep 9, 2025
d848d4d
Lint fix
matthewdouglas Sep 15, 2025
275671b
[XPU] Implemented 32bit optimizers in triton (#1710)
YangKai0616 Sep 15, 2025
1813b05
Add SYCL Kernels for XPU backend (#1679)
xiaolil1 Sep 15, 2025
4b02574
Lint fix
matthewdouglas Sep 15, 2025
404e277
[XPU] Implemented 8bit optimizers in triton (#1692)
Egor-Krivov Sep 16, 2025
dd1929b
Drop Maxwell (sm50) build from distribution (#1755)
matthewdouglas Sep 16, 2025
c9bce2b
Bump minimum PyTorch to 2.3 (#1754)
matthewdouglas Sep 16, 2025
b1f80b8
[CUDA] Branchless NF4/FP4 kDequantizeBlockwise kernel for faster dequ…
Mhmd-Hisham Sep 18, 2025
b2a8a15
Update log (#1758)
YangKai0616 Sep 19, 2025
2adcb7a
Add function to reverse 4bit weights for HPU (#1757)
vivekgoe Sep 19, 2025
e817036
Update README.md
matthewdouglas Sep 22, 2025
0507a45
Merge upstream/main into ROCm/rocm_enabled
Sep 23, 2025
359d545
Skip unsupported tests on ROCm
pnunna93 Sep 23, 2025
9f74744
update kernels.hip with latest upstream
pnunna93 Sep 23, 2025
7ba4fb4
Import missing modules
pnunna93 Sep 23, 2025
36da3e1
Fix lint errors.
pnunna93 Sep 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/FUNDING.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
open_collective: bitsandbytes
12 changes: 6 additions & 6 deletions .github/scripts/build-cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ if [[ -v cuda_targets ]]; then
elif [ "${build_arch}" = "aarch64" ]; then
build_capability="75;80;90"

# CUDA 12.8: Add sm100
[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;90;100"
# CUDA 12.8+: Add sm100/sm120
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
else
# By default, target Maxwell through Hopper.
build_capability="50;52;60;61;70;75;80;86;89;90"
# By default, target Pascal through Hopper.
build_capability="60;70;75;80;86;89;90"

# CUDA 12.8: Add sm100 and sm120; remove < sm75 to align with PyTorch 2.7+cu128 minimum
[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;86;89;90;100;120"
# CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
fi

[[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ jobs:
- os: windows-latest
arch: x86_64
cuda_version:
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
# Windows: We install Cuda on the agent (slow)
- uses: Jimver/cuda-toolkit@v0.2.22
- uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
if: startsWith(matrix.os, 'windows')
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda_version }}
# Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
method: "network"
sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
linux-local-args: '["--toolkit"]'
Expand Down
62 changes: 23 additions & 39 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,23 @@ jobs:
build-cuda:
strategy:
matrix:
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
os: [ubuntu-22.04, ubuntu-22.04-arm]
include:
- os: ubuntu-22.04
arch: x86_64
- os: ubuntu-22.04-arm
arch: aarch64
- os: windows-2025
arch: x86_64
cuda_version: "11.8.0"
runs-on: ${{ matrix.os }}

steps:
- uses: actions/checkout@v4

- name: Install CUDA Toolkit
uses: Jimver/cuda-toolkit@v0.2.23
uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
if: startsWith(matrix.os, 'windows')
id: cuda-toolkit
with:
Expand Down Expand Up @@ -100,8 +101,8 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
# Test with the oldest supported torch version and the two newest.
torch_version: ["2.2.2", "2.6.0", "2.7.1"]
# Test with the oldest supported torch version, the newest two stable/RC.
torch_version: ["2.3.1", "2.7.1", "2.8.0"]
include:
- os: ubuntu-22.04
arch: x86_64
Expand All @@ -117,7 +118,7 @@ jobs:
arch: arm64
exclude:
- os: ubuntu-22.04-arm
torch_version: "2.2.2"
torch_version: "2.3.1"

runs-on: ${{ matrix.runner || matrix.os }}
env:
Expand Down Expand Up @@ -147,9 +148,10 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov

# We need to downgrade to numpy<2 for torch<2.3 compatibility.
# We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
# See: https://github.com/pytorch/pytorch/issues/131668
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
run: pip install "numpy<2"

- name: Show installed packages
Expand All @@ -161,7 +163,7 @@ jobs:
- name: Run tests
run: pytest --durations=100

test-cpu-ipex:
test-cpu-intel:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
runs-on: banb-aws-general-8-plus-use1-public-80
Expand All @@ -185,7 +187,6 @@ jobs:
- name: Install dependencies
run: |
pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
pip install -e ".[test]"
pip install pytest-cov

Expand All @@ -195,9 +196,6 @@ jobs:
- name: Show environment information
run: python -m torch.utils.collect_env

- name: IPEX smoke test
run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"

- name: Run tests
run: pytest --durations=100

Expand All @@ -223,7 +221,7 @@ jobs:
# run: pip list

test-hpu:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
strategy:
fail-fast: false
Expand Down Expand Up @@ -279,21 +277,12 @@ jobs:
run: pytest --durations=100

test-xpu:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
strategy:
fail-fast: false
matrix:
torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
ipex: [false]
# ipex: [true, false]
# include:
# - torch_version: "2.6.0"
# ipex: true
# ipex_version: "2.6.10+xpu"
# - torch_version: "2.7.1"
# ipex: true
# ipex_version: "2.7.10+xpu"
runs-on:
group: bandb-itac-bmsprpvc1550-8-1gpu
env:
Expand Down Expand Up @@ -329,10 +318,6 @@ jobs:
- name: Install PyTorch
run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu

- name: Install IPEX
if: matrix.ipex == true
run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

- name: Install dependencies
run: |
pip install -e ".[test]"
Expand All @@ -358,17 +343,20 @@ jobs:
os: [ubuntu-22.04, windows-2025]
arch: [x86_64]
gpu: [T4, L40S]
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
include:
- cuda_version: "11.8.0"
torch_version: "2.2.2"
torch_version: "2.3.1"
pypi_index: "https://download.pytorch.org/whl/cu118"
- cuda_version: "12.6.3"
torch_version: "2.6.0"
pypi_index: "https://download.pytorch.org/whl/cu126"
- cuda_version: "12.8.1"
torch_version: "2.7.1"
pypi_index: "https://download.pytorch.org/whl/cu128"
- cuda_version: "12.9.1"
torch_version: "2.8.0"
pypi_index: "https://download.pytorch.org/whl/cu129"


# Linux L40S runners
Expand All @@ -387,7 +375,7 @@ jobs:
gpu: T4
runner: CUDA-Windows-x64
cuda_version: "11.8.0"
torch_version: "2.2.0"
torch_version: "2.3.1"
pypi_index: "https://download.pytorch.org/whl/cu118"
- os: windows-2025
arch: x86_64
Expand All @@ -401,12 +389,14 @@ jobs:
gpu: T4
runner: CUDA-Windows-x64
cuda_version: "11.8.0"
torch_version: "2.7.1"
torch_version: "2.7.1" # Note: this is the last PyTorch release supporting CUDA 11.8.
pypi_index: "https://download.pytorch.org/whl/cu118"

exclude:
# Our current T4 Windows runner has a driver too old (471.11)
# and cannot support CUDA 12+. Skip for now.
- os: windows-2025
cuda_version: "12.9.1"
- os: windows-2025
cuda_version: "12.8.1"
- os: windows-2025
Expand Down Expand Up @@ -438,15 +428,9 @@ jobs:

- name: Install dependencies
run: |
pip install torch==${{ matrix.torch_version }} --index-url ${{ matrix.pypi_index }}
pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
pip install -e ".[test]"
pip install pytest-cov

# We need to downgrade to numpy<2 for torch<2.3 compatibility.
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
run: pip install "numpy<2"

- name: Show installed packages
run: pip list

Expand Down
32 changes: 30 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
set(MPS_FILES csrc/mps_ops.mm)
set(METAL_FILES csrc/mps_kernels.metal)
set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
# C++ sources are always included
list(APPEND SRC_FILES ${CPP_FILES})

set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

if(APPLE)
Expand Down Expand Up @@ -64,10 +65,19 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS ON)
elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
if(APPLE)
message(FATAL_ERROR "XPU is not supported on macOS" )
endif()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS OFF)
set(BUILD_XPU ON)
else()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS OFF)
set(BUILD_XPU OFF)
endif()


Expand Down Expand Up @@ -217,6 +227,15 @@ elseif(BUILD_MPS)
COMMENT "Compiling Metal kernels"
VERBATIM)
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
elseif(BUILD_XPU)
list(APPEND SRC_FILES ${XPU_FILES})
string(APPEND BNB_OUTPUT_NAME "_xpu")
add_compile_definitions(BUILD_XPU)
set(CMAKE_C_COMPILER icx)
set(CMAKE_CXX_COMPILER icpx)
if(WIN32)
set(CMAKE_CXX_COMPILER icx)
endif()
else()
string(APPEND BNB_OUTPUT_NAME "_cpu")
set(GPU_SOURCES)
Expand Down Expand Up @@ -285,6 +304,15 @@ if(BUILD_MPS)
add_dependencies(bitsandbytes metallib)
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
endif()
if(BUILD_XPU)
set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")

set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})

endif()

if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
Expand Down
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include CMakeLists.txt
graft csrc
graft include
Loading
Loading