From 824e0b609770ae4c6c41fc9468a67880d18242cf Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Sun, 14 Sep 2025 13:13:29 +0200
Subject: [PATCH 1/3] ITS: mm add tests

Signed-off-by: Felix Schlepper
---
 Detectors/ITSMFT/ITS/tracking/CMakeLists.txt  |   2 +
 .../include/ITStracking/BoundedAllocator.h    |  63 +++---
 .../tracking/include/ITStracking/TimeFrame.h  |   2 +-
 .../tracking/include/ITStracking/Tracker.h    |   2 +-
 .../include/ITStracking/TrackerTraits.h       |   2 +-
 .../tracking/include/ITStracking/Vertexer.h   |   2 +-
 .../include/ITStracking/VertexerTraits.h      |   2 +-
 .../ITSMFT/ITS/tracking/src/TimeFrame.cxx     |  30 ++-
 .../ITSMFT/ITS/tracking/test/CMakeLists.txt   |  16 ++
 .../test/testBoundedMemoryResource.cxx        | 209 ++++++++++++++++++
 10 files changed, 283 insertions(+), 47 deletions(-)
 create mode 100644 Detectors/ITSMFT/ITS/tracking/test/CMakeLists.txt
 create mode 100644 Detectors/ITSMFT/ITS/tracking/test/testBoundedMemoryResource.cxx

diff --git a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
index 291ddffbf9475..001ee537f50d2 100644
--- a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
@@ -56,3 +56,5 @@ o2_target_root_dictionary(ITStracking
 if(CUDA_ENABLED OR HIP_ENABLED)
   add_subdirectory(GPU)
 endif()
+
+add_subdirectory(test)
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h
index ac9f72089602d..c19c1e56b273d 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h
@@ -69,7 +69,14 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
     } while (!mUsedMemory.compare_exchange_weak(current_used, new_used,
                                                 std::memory_order_acq_rel,
                                                 std::memory_order_relaxed));
-    return mUpstream->allocate(bytes, alignment);
+    void* p{nullptr};
+    try {
+      p = mUpstream->allocate(bytes, alignment);
+    } catch (...) {
+      mUsedMemory.fetch_sub(bytes, std::memory_order_relaxed);
+      throw;
+    }
+    return p;
   }

   void do_deallocate(void* p, size_t bytes, size_t alignment) final
@@ -87,11 +94,12 @@
   size_t getMaxMemory() const noexcept { return mMaxMemory; }
   void setMaxMemory(size_t max)
   {
-    if (mUsedMemory > max) {
+    size_t used = mUsedMemory.load(std::memory_order_acquire);
+    if (used > max) {
       ++mCountThrow;
-      throw MemoryLimitExceeded(0, mUsedMemory, max);
+      throw MemoryLimitExceeded(0, used, max);
     }
-    mMaxMemory = max;
+    mMaxMemory.store(max, std::memory_order_release);
   }

   void print() const
@@ -106,7 +114,7 @@
   }

  private:
-  size_t mMaxMemory{std::numeric_limits<size_t>::max()};
+  std::atomic<size_t> mMaxMemory{std::numeric_limits<size_t>::max()};
   std::atomic<size_t> mCountThrow{0};
   std::atomic<size_t> mUsedMemory{0};
   std::pmr::memory_resource* mUpstream;
@@ -116,66 +124,71 @@
 template <typename T>
 using bounded_vector = std::pmr::vector<T>;

 template <typename T>
-void deepVectorClear(std::vector<T>& vec)
+inline void deepVectorClear(std::vector<T>& vec)
 {
   std::vector<T>().swap(vec);
 }

 template <typename T>
-inline void deepVectorClear(bounded_vector<T>& vec, BoundedMemoryResource* bmr = nullptr)
+inline void deepVectorClear(bounded_vector<T>& vec, std::pmr::memory_resource* mr = nullptr)
 {
-  vec.~bounded_vector();
-  if (bmr == nullptr) {
-    auto alloc = vec.get_allocator().resource();
-    new (&vec) bounded_vector<T>(alloc);
+  auto* res = mr ? 
mr : vec.get_allocator().resource();
+  if (res == vec.get_allocator().resource()) {
+    bounded_vector<T> empty{std::pmr::polymorphic_allocator<T>{res}};
+    vec.swap(empty);
   } else {
-    new (&vec) bounded_vector<T>(bmr);
+    vec = bounded_vector<T>(std::pmr::polymorphic_allocator<T>{res});
   }
 }

 template <typename T>
-void deepVectorClear(std::vector<bounded_vector<T>>& vec, BoundedMemoryResource* bmr = nullptr)
+inline void deepVectorClear(std::vector<bounded_vector<T>>& vec, std::pmr::memory_resource* mr = nullptr)
 {
   for (auto& v : vec) {
-    deepVectorClear(v, bmr);
+    deepVectorClear(v, mr);
   }
 }

 template <typename T, size_t S>
-void deepVectorClear(std::array<bounded_vector<T>, S>& arr, BoundedMemoryResource* bmr = nullptr)
+inline void deepVectorClear(std::array<bounded_vector<T>, S>& arr, std::pmr::memory_resource* mr = nullptr)
 {
   for (size_t i{0}; i < S; ++i) {
-    deepVectorClear(arr[i], bmr);
+    deepVectorClear(arr[i], mr);
   }
 }

 template <typename T>
-void clearResizeBoundedVector(bounded_vector<T>& vec, size_t size, BoundedMemoryResource* bmr, T def = T())
+inline void clearResizeBoundedVector(bounded_vector<T>& vec, size_t sz, std::pmr::memory_resource* mr = nullptr, T def = T())
 {
-  vec.~bounded_vector();
-  new (&vec) bounded_vector<T>(size, def, bmr);
+  auto* res = mr ? mr : vec.get_allocator().resource();
+  if (res == vec.get_allocator().resource()) {
+    bounded_vector<T> tmp(sz, def, std::pmr::polymorphic_allocator<T>{res});
+    vec.swap(tmp);
+  } else {
+    vec = bounded_vector<T>(sz, def, std::pmr::polymorphic_allocator<T>{res});
+  }
 }

 template <typename T>
-void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, BoundedMemoryResource* bmr)
+void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, std::pmr::memory_resource* mr)
 {
   vec.clear();
   vec.reserve(size);
-  for (size_t i{0}; i < size; ++i) {
-    vec.emplace_back(bmr);
+  for (size_t i = 0; i < size; ++i) {
+    vec.emplace_back(std::pmr::polymorphic_allocator<bounded_vector<T>>{mr});
   }
 }

 template <typename T, size_t S>
-void clearResizeBoundedArray(std::array<bounded_vector<T>, S>& arr, size_t size, BoundedMemoryResource* bmr, T def = T())
+inline void clearResizeBoundedArray(std::array<bounded_vector<T>, S>& arr, size_t size, std::pmr::memory_resource* mr = nullptr, T def = T())
 {
   for (size_t i{0}; i < S; ++i) {
-    clearResizeBoundedVector(arr[i], size, bmr, def);
+    clearResizeBoundedVector(arr[i], size, mr, def);
   }
 }

 template <typename T>
-std::vector<T> toSTDVector(const bounded_vector<T>& b)
+inline std::vector<T> toSTDVector(const bounded_vector<T>& b)
 {
   std::vector<T> t(b.size());
   std::copy(b.cbegin(), b.cend(), t.begin());
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index c34701ce222e2..7b5b84b1acde2 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -188,7 +188,7 @@ struct TimeFrame {
   auto getNumberOfUsedExtendedClusters() const { return mNExtendedUsedClusters; }

   /// memory management
-  void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool);
+  void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool);
   auto& getMemoryPool() const noexcept { return mMemoryPool; }
   bool checkMemory(unsigned long max) { return getArtefactsMemory() < max; }
   unsigned long getArtefactsMemory() const;
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h
index 642717bd09596..4c903ed1f3ca1 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h
@@ -66,7 +66,7 @@ class Tracker
     const LogFunc& = [](const std::string& s) { std::cerr << s << '\n'; });

   void setParameters(const std::vector<TrackingParameters>& p) { mTrkParams = 
p; }
-  void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool) { mMemoryPool = pool; }
+  void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool) { mMemoryPool = pool; }
   std::vector<TrackingParameters>& getParameters() { return mTrkParams; }
   void setBz(float bz) { mTraits->setBz(bz); }
   bool isMatLUT() const { return mTraits->isMatLUT(); }
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h
index 9d14bb91635a0..ee64cacb8fa2a 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h
@@ -70,7 +70,7 @@ class TrackerTraits
   bool isMatLUT() const;
   virtual const char* getName() const noexcept { return "CPU"; }
   virtual bool isGPU() const noexcept { return false; }
-  void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool) noexcept { mMemoryPool = pool; }
+  void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool) noexcept { mMemoryPool = pool; }
   auto getMemoryPool() const noexcept { return mMemoryPool; }

   // Others
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h
index ab92e7c1a1523..d66bcd6ee2358 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h
@@ -52,7 +52,7 @@ class Vertexer
   auto& getVertParameters() const { return mTraits->getVertexingParameters(); }
   void setParameters(const std::vector<VertexingParameters>& vertParams) { mVertParams = vertParams; }
   const auto& getParameters() const noexcept { return mVertParams; }
-  void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool) { mMemoryPool = pool; }
+  void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool) { mMemoryPool = pool; }
   std::vector<Vertex> exportVertices();
   VertexerTraitsN* getTraits() const { return mTraits; };
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
index dda32ddfd5aec..b1422d66e12df 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
@@ -82,7 +82,7 @@ class VertexerTraits
   virtual bool isGPU() const noexcept { return false; }
   virtual const char* getName() const noexcept { return "CPU"; }
   virtual bool usesMemoryPool() const noexcept { return true; }
-  void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool) { mMemoryPool = pool; }
+  void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool) { mMemoryPool = pool; }

   static std::pair<float, float> computeMain(const bounded_vector<float>& elements)
   {
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
index 510c66e2420f1..741acd227cb32 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
@@ -575,36 +575,32 @@ void TimeFrame<nLayers>::printSliceInfo(const int startROF, const int sliceSize)
     LOG(info) << "Number of seeding vertices: " << getPrimaryVertices(iROF).size();
     int iVertex{0};
     for (auto& v : getPrimaryVertices(iROF)) {
-      LOG(info) << "\t vertex " << iVertex++ << ": x=" << v.getX() << " " << " y=" << v.getY() << " z=" << v.getZ() << " has " << v.getNContributors() << " contributors.";
+      LOG(info) << "\t vertex " << iVertex++ << ": x=" << v.getX() << " "
+                << " y=" << v.getY() << " z=" << v.getZ() << " has " << v.getNContributors() << " contributors.";
     }
   }
 }

 template <int nLayers>
-void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool)
+void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool)
 {
   mMemoryPool = pool;
-  auto 
initVector = [&]<typename T>(bounded_vector<T>& vec) {
-    auto alloc = vec.get_allocator().resource();
-    if (alloc != mMemoryPool.get()) {
-      vec = bounded_vector<T>(mMemoryPool.get());
-    }
+  auto initVector = [&]<typename T>(bounded_vector<T>& vec)
+  {
+    bounded_vector<T> tmp(std::pmr::polymorphic_allocator<T>{mMemoryPool.get()});
+    vec.swap(tmp);
   };
-  auto initArrays = [&]<typename T, size_t S>(std::array<bounded_vector<T>, S>& arr) {
+  auto initArrays = [&]<typename T, size_t S>(std::array<bounded_vector<T>, S>& arr)
+  {
     for (size_t i{0}; i < S; ++i) {
-      auto alloc = arr[i].get_allocator().resource();
-      if (alloc != mMemoryPool.get()) {
-        arr[i] = bounded_vector<T>(mMemoryPool.get());
-      }
+      initVector(arr[i]);
     }
   };
-  auto initVectors = [&]<typename T>(std::vector<bounded_vector<T>>& vec) {
+  auto initVectors = [&]<typename T>(std::vector<bounded_vector<T>>& vec)
+  {
     for (size_t i{0}; i < vec.size(); ++i) {
-      auto alloc = vec[i].get_allocator().resource();
-      if (alloc != mMemoryPool.get()) {
-        vec[i] = bounded_vector<T>(mMemoryPool.get());
-      }
+      initVector(vec[i]);
     }
   };
diff --git a/Detectors/ITSMFT/ITS/tracking/test/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/test/CMakeLists.txt
new file mode 100644
index 0000000000000..818ad1d667371
--- /dev/null
+++ b/Detectors/ITSMFT/ITS/tracking/test/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+o2_add_test(boundedmemoryresource
+            SOURCES testBoundedMemoryResource.cxx
+            COMPONENT_NAME its-tracking
+            LABELS "its;tracking"
+            PUBLIC_LINK_LIBRARIES O2::ITStracking)
diff --git a/Detectors/ITSMFT/ITS/tracking/test/testBoundedMemoryResource.cxx b/Detectors/ITSMFT/ITS/tracking/test/testBoundedMemoryResource.cxx
new file mode 100644
index 0000000000000..e7d8bd9de25de
--- /dev/null
+++ b/Detectors/ITSMFT/ITS/tracking/test/testBoundedMemoryResource.cxx
@@ -0,0 +1,209 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
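+
+// A minimal usage sketch of the interface exercised below, relying only on what
+// BoundedAllocator.h (above) declares: allocations routed through a
+// std::pmr::polymorphic_allocator are counted against the configured byte cap,
+// and BoundedMemoryResource::MemoryLimitExceeded is thrown (with the
+// reservation rolled back) once the cap would be exceeded:
+//
+//   BoundedMemoryResource bmr(1024);                                   // 1 KiB cap
+//   bounded_vector<int> v(std::pmr::polymorphic_allocator<int>{&bmr});
+//   v.reserve(64);   // 256 bytes, visible via bmr.getUsedMemory()
+//   v.reserve(512);  // 2 KiB > 1 KiB cap: throws MemoryLimitExceeded, v unchanged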
+
+#define BOOST_TEST_MODULE Test Flags
+#define BOOST_TEST_MAIN
+#define BOOST_TEST_DYN_LINK
+
+#include <boost/test/unit_test.hpp>
+#include <random>
+#include <algorithm>
+#include "ITStracking/BoundedAllocator.h"
+
+using namespace o2::its;
+using Vec = bounded_vector<int>;
+
+auto getRandomInt(int min = -100, int max = 100)
+{
+  static std::mt19937 gen(std::random_device{}()); // static generator, seeded once
+  std::uniform_int_distribution<> dist(min, max);
+  return [&, dist]() mutable {
+    return dist(gen);
+  };
+}
+
+// -------- Throwing upstream resource for testing rollback --------
+class ThrowingResource final : public std::pmr::memory_resource
+{
+ protected:
+  void* do_allocate(size_t, size_t) final
+  {
+    throw std::bad_alloc(); // always fail
+  }
+  void do_deallocate(void*, size_t, size_t) noexcept final
+  {
+    // nothing
+  }
+  bool do_is_equal(const std::pmr::memory_resource& other) const noexcept final
+  {
+    return this == &other;
+  }
+};
+
+// -------- Upstream resource with empty deallocate --------
+class NoDeallocateResource final : public std::pmr::memory_resource
+{
+ public:
+  NoDeallocateResource(std::pmr::memory_resource* upstream = std::pmr::get_default_resource())
+    : mUpstream(upstream) {}
+
+ protected:
+  void* do_allocate(size_t bytes, size_t alignment) final
+  {
+    return mUpstream->allocate(bytes, alignment);
+  }
+  void do_deallocate(void*, size_t, size_t) noexcept final
+  {
+    // nothing
+  }
+  bool do_is_equal(const std::pmr::memory_resource& other) const noexcept final
+  {
+    return this == &other;
+  }
+
+ private:
+  std::pmr::memory_resource* mUpstream;
+};
+
+// -------- Tests --------
+BOOST_AUTO_TEST_CASE(allocation_and_clear_updates_used_memory)
+{
+  BoundedMemoryResource bmr(10 * 1024 * 1024); // 10 MB cap
+
+  Vec v(std::pmr::polymorphic_allocator<int>{&bmr});
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), 0u);
+
+  const size_t count = 128;
+  v.reserve(count);
+  const size_t expected = count * sizeof(int);
+  BOOST_CHECK_GE(bmr.getUsedMemory(), expected);
+  BOOST_CHECK_LE(bmr.getUsedMemory(), expected + 64);
+
+  deepVectorClear(v, &bmr);
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), 0u);
+}
+
+BOOST_AUTO_TEST_CASE(clearResizeBoundedVector_resizes_and_tracks_memory)
+{
+  BoundedMemoryResource bmr(1024 * 1024); // 1 MB cap
+
+  Vec v(std::pmr::polymorphic_allocator<int>{&bmr});
+  v.reserve(200);
+  const size_t used_before = bmr.getUsedMemory();
+  BOOST_CHECK_GT(used_before, 0u);
+
+  clearResizeBoundedVector(v, 50, &bmr, 7);
+  const size_t used_after = bmr.getUsedMemory();
+  BOOST_CHECK_GE(used_after, 50 * sizeof(int));
+  BOOST_CHECK_LT(used_after, used_before);
+
+  clearResizeBoundedVector(v, 300, &bmr, 3);
+  BOOST_CHECK_GE(bmr.getUsedMemory(), 300 * sizeof(int));
+}
+
+BOOST_AUTO_TEST_CASE(clearResizeBoundedVector_throws_on_over_limit_and_preserves_previous_usage)
+{
+  const size_t cap = 256 * sizeof(int); // a cap that will be exceeded
+  BoundedMemoryResource bmr(cap);
+  Vec v(std::pmr::polymorphic_allocator<int>{&bmr});
+  clearResizeBoundedVector(v, 10, &bmr, 1);
+  const size_t before = bmr.getUsedMemory();
+  BOOST_CHECK_GT(before, 0u);
+  BOOST_CHECK_LE(before, cap);
+  bool threw = false;
+  try {
+    clearResizeBoundedVector(v, cap / sizeof(int) + 10, &bmr, 2);
+  } catch (const BoundedMemoryResource::MemoryLimitExceeded&) {
+    threw = true;
+  }
+  BOOST_CHECK(threw);
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), before);
+}
+
+BOOST_AUTO_TEST_CASE(upstream_throw_rolls_back_reservation)
+{
+  ThrowingResource upstream;
+  BoundedMemoryResource bmr(std::numeric_limits<size_t>::max(), &upstream);
+  const size_t bytes = 1024;
+  bool threw = false;
+  void* p{nullptr};
+
+  try {
+    p = bmr.allocate(bytes, alignof(std::max_align_t));
+  } catch (const std::bad_alloc&) {
+    threw = true;
+  }
+  BOOST_CHECK(threw);
+  BOOST_CHECK_EQUAL(p, nullptr);
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), 0u);
+}
+
+BOOST_AUTO_TEST_CASE(vector_of_bounded_vectors_deep_clear_releases_all)
+{
+  BoundedMemoryResource bmr(10 * 1024 * 1024); // 10 MB
+  std::vector<Vec> outer;
+  outer.reserve(5);
+  for (int i = 0; i < 5; ++i) {
+    outer.emplace_back(std::pmr::polymorphic_allocator<int>{&bmr});
+    outer.back().reserve(100);
+  }
+  BOOST_CHECK_GT(bmr.getUsedMemory(), 0u);
+  deepVectorClear(outer, &bmr); // deep clear outer
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), 0u);
+}
+
+BOOST_AUTO_TEST_CASE(array_of_bounded_vectors_clear_resize_works)
+{
+  BoundedMemoryResource bmr(10 * 1024 * 1024);
+  std::array<Vec, 3> arr{{Vec(std::pmr::polymorphic_allocator<int>{&bmr}),
+                          Vec(std::pmr::polymorphic_allocator<int>{&bmr}),
+                          Vec(std::pmr::polymorphic_allocator<int>{&bmr})}};
+  clearResizeBoundedVector(arr[0], 10, &bmr, 1);
+  clearResizeBoundedVector(arr[1], 20, &bmr, 2);
+  clearResizeBoundedVector(arr[2], 30, &bmr, 3);
+  BOOST_CHECK_GT(bmr.getUsedMemory(), 0u);
+  deepVectorClear(arr, &bmr); // now clear all recursively
+  BOOST_CHECK_EQUAL(bmr.getUsedMemory(), 0u);
+}
+
+BOOST_AUTO_TEST_CASE(deepVectorClear_releases_and_reuses_resource)
+{
+  // Use a small bounded memory resource
+  BoundedMemoryResource bmr(1024);
+  bounded_vector<int> vec{std::pmr::polymorphic_allocator<int>{&bmr}};
+  vec.resize(100, 42);
+  BOOST_TEST(bmr.getUsedMemory() > 0);
+  deepVectorClear(vec, &bmr);
+  BOOST_TEST(vec.empty());
+  BOOST_TEST(vec.get_allocator().resource() == &bmr);
+  const auto usedAfter = bmr.getUsedMemory();
+  BOOST_CHECK_EQUAL(usedAfter, 0u);
+  vec.push_back(7);
+  BOOST_TEST(vec.size() == 1);
+  BOOST_TEST(vec[0] == 7);
+  BOOST_TEST(vec.get_allocator().resource() == &bmr);
+}
+
+BOOST_AUTO_TEST_CASE(clear_with_memory_resource_without_deallocator)
+{
+  NoDeallocateResource dmr;
+  Vec v(std::pmr::polymorphic_allocator<int>{&dmr});
+
+  for (int shift{0}; shift < 12; ++shift) {
+    const int c{1 << shift};
+    v.resize(100);
+    std::generate(v.begin(), v.end(), getRandomInt());
+    // resize to varying sizes; deallocation is a no-op for this resource
+    clearResizeBoundedVector(v, c / 2, &dmr, 999);
+    for (int i{0}; i < c / 2; ++i) { // now only the first c/2 elements should be set
+      BOOST_CHECK_EQUAL(v[i], 999);
+    }
+    // try to deepclear
+    deepVectorClear(v);
+  }
+}

From 70eb6b49ce87d3092a00c6ea1cd34edc9e91f46e Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Mon, 15 Sep 2025 13:05:45 +0200
Subject: [PATCH 2/3] ITS: GPU: use pinned framework memory

Signed-off-by: Felix Schlepper
---
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 142 ++++++++++--------
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |  14 +-
 .../include/ITStracking/BoundedAllocator.h    |   8 +-
 .../include/ITStracking/ExternalAllocator.h   |  32 ++++
 .../tracking/include/ITStracking/TimeFrame.h  |  37 +++--
 .../ITSMFT/ITS/tracking/src/TimeFrame.cxx     |  77 +++++-----
 .../ITS3/reconstruction/src/IOUtils.cxx       |   2 +-
 GPU/GPUTracking/Global/GPUChainITS.cxx        |  19 ++-
 GPU/GPUTracking/Global/GPUChainITS.h          |   9 +-
 9 files changed, 193 insertions(+), 147 deletions(-)

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 4da91522371f8..27bcf04746da5 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -32,7 +32,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, 
Stream& stream, bool extAllocator) { if (extAllocator) { - *ptr = this->mAllocator->allocate(size); + *ptr = this->mExtDeviceAllocator->allocate(size); } else { GPULog("Calling default CUDA allocator"); GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, stream.get())); @@ -43,7 +43,7 @@ template void TimeFrameGPU::allocMem(void** ptr, size_t size, bool extAllocator) { if (extAllocator) { - *ptr = this->mAllocator->allocate(size); + *ptr = this->mExtDeviceAllocator->allocate(size); } else { GPULog("Calling default CUDA allocator"); GPUChkErrS(cudaMalloc(reinterpret_cast(ptr), size)); @@ -56,7 +56,7 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) GPUTimer timer("loading indextable utils"); if (!iteration) { GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtilsN) / constants::MB); - allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtilsN), this->getExtAllocator()); + allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtilsN), this->hasExternalDeviceAllocator()); } GPULog("gpu-transfer: loading IndexTableUtils object, for {:.2f} MB.", sizeof(IndexTableUtilsN) / constants::MB); GPUChkErrS(cudaMemcpy(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtilsN), cudaMemcpyHostToDevice)); @@ -67,12 +67,14 @@ void TimeFrameGPU::createUnsortedClustersDeviceArray(const int iteratio { if (!iteration) { GPUTimer timer("creating unsorted clusters array"); - allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); mPinnedUnsortedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - mPinnedUnsortedClusters.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + mPinnedUnsortedClusters.set(iLayer); + } } } } @@ -83,7 +85,7 @@ void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration, cons if (!iteration) { GPUTimer timer(mGpuStreams[layer], "loading unsorted clusters", layer); GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[layer].size(), layer, this->mUnsortedClusters[layer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[layer], this->mUnsortedClusters[layer].data(), this->mUnsortedClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mUnsortedClustersDeviceArray[layer], &mUnsortedClustersDevice[layer], sizeof(Cluster*), 
cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -94,12 +96,14 @@ void TimeFrameGPU::createClustersDeviceArray(const int iteration, const { if (!iteration) { GPUTimer timer("creating sorted clusters array"); - allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); mPinnedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - mPinnedClusters.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + mPinnedClusters.set(iLayer); + } } } } @@ -110,7 +114,7 @@ void TimeFrameGPU::loadClustersDevice(const int iteration, const int la if (!iteration) { GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[layer].size(), layer, this->mClusters[layer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(mClustersDevice[layer], this->mClusters[layer].data(), this->mClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mClustersDeviceArray[layer], &mClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -121,12 +125,14 @@ void TimeFrameGPU::createClustersIndexTablesArray(const int iteration) { if (!iteration) { GPUTimer timer("creating clustersindextable array"); - allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); mPinnedClustersIndexTables.set(nLayers); - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - mPinnedClustersIndexTables.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedClustersIndexTables.set(iLayer); + } } } } @@ -137,7 +143,7 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration, const i if (!iteration) { GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", layer, 
this->mIndexTables[layer].size(), this->mIndexTables[layer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[layer], this->mIndexTables[layer].data(), this->mIndexTables[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mClustersIndexTablesDeviceArray[layer], &mClustersIndexTablesDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -148,12 +154,14 @@ void TimeFrameGPU::createUsedClustersDeviceArray(const int iteration, c { if (!iteration) { GPUTimer timer("creating used clusters flags"); - allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(uint8_t*), cudaHostRegisterPortable)); mPinnedUsedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(uint8_t), cudaHostRegisterPortable)); - mPinnedUsedClusters.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(uint8_t), cudaHostRegisterPortable)); + mPinnedUsedClusters.set(iLayer); + } } } } @@ -164,7 +172,7 @@ void TimeFrameGPU::createUsedClustersDevice(const int iteration, const if (!iteration) { GPUTimer timer(mGpuStreams[layer], "creating used clusters flags", layer); GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[layer].size(), layer, this->mUsedClusters[layer].size() * sizeof(unsigned char) / constants::MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[layer], 0, this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mUsedClustersDeviceArray[layer], &mUsedClustersDevice[layer], sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -185,12 +193,14 @@ void TimeFrameGPU::createROFrameClustersDeviceArray(const int iteration { if (!iteration) { GPUTimer timer("creating ROFrame clusters array"); - allocMem(reinterpret_cast(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); mPinnedROFramesClusters.set(nLayers); - for (auto iLayer{0}; 
iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - mPinnedROFramesClusters.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedROFramesClusters.set(iLayer); + } } } } @@ -201,7 +211,7 @@ void TimeFrameGPU::loadROFrameClustersDevice(const int iteration, const if (!iteration) { GPUTimer timer(mGpuStreams[layer], "loading ROframe clusters", layer); GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[layer].size(), layer, this->mROFramesClusters[layer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[layer], this->mROFramesClusters[layer].data(), this->mROFramesClusters[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mROFramesClustersDeviceArray[layer], &mROFramesClustersDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -212,12 +222,14 @@ void TimeFrameGPU::createTrackingFrameInfoDeviceArray(const int iterati { if (!iteration) { GPUTimer timer("creating trackingframeinfo array"); - allocMem(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); mPinnedTrackingFrameInfo.set(nLayers); - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - mPinnedTrackingFrameInfo.set(iLayer); + if (!this->hasExternalDeviceAllocator()) { + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); + mPinnedTrackingFrameInfo.set(iLayer); + } } } } @@ -228,7 +240,7 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration, con if (!iteration) { GPUTimer timer(mGpuStreams[layer], "loading trackingframeinfo", layer); GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[layer].size(), layer, this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->hasExternalDeviceAllocator()); 
GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[layer], this->mTrackingFrameInfo[layer].data(), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mTrackingFrameInfoDeviceArray[layer], &mTrackingFrameInfoDevice[layer], sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -241,7 +253,7 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) GPUTimer timer("loading multiplicity cut mask"); GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(uint8_t) / constants::MB); if (!iteration) { // only allocate on first call - allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->getExtAllocator()); + allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->hasExternalDeviceAllocator()); } GPUChkErrS(cudaMemcpy(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); } @@ -253,10 +265,10 @@ void TimeFrameGPU::loadVertices(const int iteration) if (!iteration) { GPUTimer timer("loading seeding vertices"); GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB); - allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->getExtAllocator()); + allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice)); GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB); - allocMem(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->getExtAllocator()); + allocMem(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice)); } } @@ -265,7 +277,7 @@ template void TimeFrameGPU::createTrackletsLUTDeviceArray(const int iteration) { if (!iteration) { - allocMem(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->hasExternalDeviceAllocator()); } } @@ -276,7 +288,7 @@ void TimeFrameGPU::createTrackletsLUTDevice(const int iteration, const const int ncls = this->mClusters[layer].size() + 1; if (!iteration) { GPULog("gpu-allocation: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, layer, ncls * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(&mTrackletsLUTDeviceArray[layer], &mTrackletsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, 
mGpuStreams[layer].get())); } GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[layer], 0, ncls * sizeof(int), mGpuStreams[layer].get())); @@ -287,7 +299,7 @@ void TimeFrameGPU::createTrackletsBuffersArray(const int iteration) { if (!iteration) { GPUTimer timer("creating tracklet buffers array"); - allocMem(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->hasExternalDeviceAllocator()); } } @@ -299,7 +311,7 @@ void TimeFrameGPU::createTrackletsBuffers(const int layer) GPUChkErrS(cudaMemcpyAsync(&mNTracklets[layer], mTrackletsLUTDevice[layer] + this->mClusters[layer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); mGpuStreams[layer].sync(); // ensure number of tracklets is correct GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -331,7 +343,7 @@ void TimeFrameGPU::createNeighboursIndexTablesDevice(const int layer) { GPUTimer timer(mGpuStreams[layer], "creating cells neighbours", layer); GPULog("gpu-transfer: reserving neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[layer] + 1, layer, (mNCells[layer] + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[layer], 0, (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); } @@ -340,7 +352,7 @@ void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const uns { GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT"); GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); // We need one element more to move exc -> inc + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); // We need one element more to move exc -> inc GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get())); } @@ -350,8 +362,8 @@ void TimeFrameGPU::loadCellsDevice() GPUTimer timer(mGpuStreams, "loading cell seeds", nLayers - 2); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { GPULog("gpu-transfer: loading {} cell seeds on layer {}, for {:.2f} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeedN) / constants::MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * 
sizeof(CellSeedN), mGpuStreams[iLayer], this->getExtAllocator()); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); // accessory for the neigh. finding. + allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeedN), mGpuStreams[iLayer], this->hasExternalDeviceAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->hasExternalDeviceAllocator()); // accessory for the neigh. finding. GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get())); GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeedN), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } @@ -362,7 +374,7 @@ void TimeFrameGPU::createCellsLUTDeviceArray(const int iteration) { if (!iteration) { GPUTimer timer("creating cells LUTs array"); - allocMem(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->hasExternalDeviceAllocator()); } } @@ -371,7 +383,7 @@ void TimeFrameGPU::createCellsLUTDevice(const int layer) { GPUTimer timer(mGpuStreams[layer], "creating cells LUTs", layer); GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer] + 1, layer, (mNTracklets[layer] + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[layer], 0, (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); GPUChkErrS(cudaMemcpyAsync(&mCellsLUTDeviceArray[layer], &mCellsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -381,7 +393,7 @@ void TimeFrameGPU::createCellsBuffersArray(const int iteration) { if (!iteration) { GPUTimer timer("creating cells buffers array"); - allocMem(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeedN*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeedN*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpy(mCellsDeviceArray, mCellsDevice.data(), mCellsDevice.size() * sizeof(CellSeedN*), cudaMemcpyHostToDevice)); } } @@ -394,7 +406,7 @@ void TimeFrameGPU::createCellsBuffers(const int layer) GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); mGpuStreams[layer].sync(); // ensure number of cells is correct GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeedN) / constants::MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->hasExternalDeviceAllocator()); 
GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeedN*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } @@ -414,7 +426,7 @@ void TimeFrameGPU::loadRoadsDevice() { GPUTimer timer("loading roads device"); GPULog("gpu-transfer: loading {} roads, for {:.2f} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / constants::MB); - allocMem(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->getExtAllocator()); + allocMem(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpy(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice)); } @@ -424,7 +436,7 @@ void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seed { GPUTimer timer("loading track seeds"); GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeedN) / constants::MB); - allocMem(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeedN), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeedN), cudaMemcpyHostToDevice)); } @@ -437,10 +449,10 @@ void TimeFrameGPU::createNeighboursDevice(const unsigned int layer) GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); mGpuStreams[layer].sync(); // ensure number of neighbours is correct GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer].get())); GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); } template @@ -449,7 +461,7 @@ void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& s GPUTimer timer("reserving tracks"); mTrackITSExt = bounded_vector(seeds.size(), {}, this->getMemoryPool().get()); GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB); - allocMem(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), 
this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt))); GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } @@ -462,13 +474,13 @@ void TimeFrameGPU::createVtxTrackletsLUTDevice(const int32_t iteration) for (int32_t iMode{0}; iMode < 2; ++iMode) { if (!iteration) { GPULog("gpu-transfer: creating vertexer tracklets per cluster for {} elements for mode {}, for {:.2f} MB.", ncls, iMode, ncls * sizeof(int32_t) / constants::MB); - allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator()); GPULog("gpu-transfer: creating vertexer tracklets per cluster sum for {} elements for mode {}, for {:.2f} MB.", ncls + 1, iMode, (ncls + 1) * sizeof(int32_t) / constants::MB); - allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator()); GPULog("gpu-transfer: creating vertexer tracklets per ROF for {} elements for mode {}, for {:.2f} MB.", this->mNrof + 1, iMode, (this->mNrof + 1) * sizeof(int32_t) / constants::MB); - allocMemAsync(reinterpret_cast(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator()); } GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[iMode], 0, ncls * sizeof(int32_t), mGpuStreams[iMode].get())); GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterSumDevice[iMode], 0, (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode].get())); @@ -477,13 +489,13 @@ void TimeFrameGPU::createVtxTrackletsLUTDevice(const int32_t iteration) mGpuStreams[0].sync(); mGpuStreams[1].sync(); if (!iteration) { - allocMem(reinterpret_cast(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterDeviceArray, mNTrackletsPerClusterDevice.data(), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); - allocMem(reinterpret_cast(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterSumDeviceArray, mNTrackletsPerClusterSumDevice.data(), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); - allocMem(reinterpret_cast(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator()); 
GPUChkErrS(cudaMemcpy(mNTrackletsPerROFDeviceArray, mNTrackletsPerROFDevice.data(), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); } } @@ -496,11 +508,11 @@ void TimeFrameGPU::createVtxTrackletsBuffers(const int32_t iteration) this->mTotalTracklets[iMode] = 0; GPUChkErrS(cudaMemcpyAsync(&(this->mTotalTracklets[iMode]), mNTrackletsPerClusterSumDevice[iMode] + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost, mGpuStreams[iMode].get())); GPULog("gpu-transfer: creating vertexer tracklets buffer for {} elements on layer {}, for {:.2f} MB.", this->mTotalTracklets[iMode], iMode, this->mTotalTracklets[iMode] * sizeof(Tracklet) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->hasExternalDeviceAllocator()); } mGpuStreams[0].sync(); mGpuStreams[1].sync(); - allocMem(reinterpret_cast(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->hasExternalDeviceAllocator()); GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpy(mTrackletsDeviceArray, mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaMemcpyHostToDevice)); } @@ -512,14 +524,14 @@ void TimeFrameGPU::createVtxLinesLUTDevice(const int32_t iteration) const int32_t ncls = this->mClusters[1].size(); GPULog("gpu-transfer: creating vertexer lines per cluster for {} elements , for {:.2f} MB.", ncls, ncls * sizeof(int32_t) / constants::MB); - allocMem(reinterpret_cast(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->getExtAllocator()); + allocMem(reinterpret_cast(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->hasExternalDeviceAllocator()); GPULog("gpu-transfer: creating vertexer lines per cluster sum for {} elements , for {:.2f} MB.", ncls + 1, (ncls + 1) * sizeof(int32_t) / constants::MB); - allocMem(reinterpret_cast(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->getExtAllocator()); + allocMem(reinterpret_cast(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->hasExternalDeviceAllocator()); const int32_t ntrkls = this->mTotalTracklets[0]; GPULog("gpu-transfer: creating vertexer used tracklets for {} elements , for {:.2f} MB.", ntrkls, ntrkls * sizeof(uint8_t) / constants::MB); - allocMem(reinterpret_cast(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->getExtAllocator()); + allocMem(reinterpret_cast(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->hasExternalDeviceAllocator()); } template @@ -530,7 +542,7 @@ void TimeFrameGPU::createVtxLinesBuffer(const int32_t iteration) GPUChkErrS(cudaMemcpy(&nlines, mNLinesPerClusterDevice + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost)); this->mTotalLines = nlines; GPULog("gpu-transfer: creating vertexer lines for {} elements , for {:.2f} MB.", nlines, nlines * sizeof(Line) / constants::MB); - allocMem(reinterpret_cast(&mLinesDevice), nlines * sizeof(Line), this->getExtAllocator()); + allocMem(reinterpret_cast(&mLinesDevice), nlines * sizeof(Line), this->hasExternalDeviceAllocator()); // reset used tracklets GPUChkErrS(cudaMemset(mUsedTrackletsDevice, 0, this->mTotalTracklets[0] * sizeof(uint8_t))); } diff --git 
a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 047d42d815e99..cca4283c9b77f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -33,12 +33,14 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->loadVertices(iteration); mTimeFrameGPU->loadIndexTableUtils(iteration); mTimeFrameGPU->loadMultiplicityCutMask(iteration); + // pinned on host mTimeFrameGPU->createUsedClustersDeviceArray(iteration); mTimeFrameGPU->createClustersDeviceArray(iteration); mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration); mTimeFrameGPU->createClustersIndexTablesArray(iteration); mTimeFrameGPU->createTrackingFrameInfoDeviceArray(iteration); mTimeFrameGPU->createROFrameClustersDeviceArray(iteration); + // device array mTimeFrameGPU->createTrackletsLUTDeviceArray(iteration); mTimeFrameGPU->createTrackletsBuffersArray(iteration); mTimeFrameGPU->createCellsBuffersArray(iteration); @@ -106,7 +108,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), - mTimeFrameGPU->getExternalAllocator(), + mTimeFrameGPU->getExternalDeviceAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -144,7 +146,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), - mTimeFrameGPU->getExternalAllocator(), + mTimeFrameGPU->getExternalDeviceAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -195,7 +197,7 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].MaxChi2ClusterAttachment, this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, - mTimeFrameGPU->getExternalAllocator(), + mTimeFrameGPU->getExternalDeviceAllocator(), conf.nBlocksLayerCells[iteration], conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); @@ -251,7 +253,7 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) currentLayerCellsNum, nextLayerCellsNum, 1e2, - mTimeFrameGPU->getExternalAllocator(), + mTimeFrameGPU->getExternalDeviceAllocator(), conf.nBlocksFindNeighbours[iteration], conf.nThreadsFindNeighbours[iteration], mTimeFrameGPU->getStream(iLayer)); @@ -279,7 +281,7 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) mTimeFrameGPU->getDeviceNeighbours(iLayer), mTimeFrameGPU->getArrayNNeighbours()[iLayer], mTimeFrameGPU->getStream(iLayer), - mTimeFrameGPU->getExternalAllocator()); + mTimeFrameGPU->getExternalDeviceAllocator()); } mTimeFrameGPU->syncStreams(false); } @@ -310,7 +312,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) this->mTrkParams[0].MaxChi2NDF, mTimeFrameGPU->getDevicePropagator(), this->mTrkParams[0].CorrType, - mTimeFrameGPU->getExternalAllocator(), + mTimeFrameGPU->getExternalDeviceAllocator(), conf.nBlocksProcessNeighbours[iteration], conf.nThreadsProcessNeighbours[iteration]); } diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h index c19c1e56b273d..6a1a7e18e16f4 100644 --- 
a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h @@ -22,6 +22,8 @@ #include #include +#include "ITStracking/ExternalAllocator.h" + #include "GPUCommonLogger.h" namespace o2::its @@ -56,6 +58,7 @@ class BoundedMemoryResource final : public std::pmr::memory_resource BoundedMemoryResource(size_t maxBytes = std::numeric_limits::max(), std::pmr::memory_resource* upstream = std::pmr::get_default_resource()) : mMaxMemory(maxBytes), mUpstream(upstream) {} + BoundedMemoryResource(ExternalAllocator* alloc) : mAdaptor(std::make_unique(alloc)), mUpstream(mAdaptor.get()) {} void* do_allocate(size_t bytes, size_t alignment) final { @@ -117,7 +120,8 @@ class BoundedMemoryResource final : public std::pmr::memory_resource std::atomic mMaxMemory{std::numeric_limits::max()}; std::atomic mCountThrow{0}; std::atomic mUsedMemory{0}; - std::pmr::memory_resource* mUpstream; + std::unique_ptr mAdaptor{nullptr}; + std::pmr::memory_resource* mUpstream{nullptr}; }; template @@ -170,7 +174,7 @@ inline void clearResizeBoundedVector(bounded_vector& vec, size_t sz, std::pmr } template -void clearResizeBoundedVector(std::vector>& vec, size_t size, std::pmr::memory_resource* mr) +inline void clearResizeBoundedVector(std::vector>& vec, size_t size, std::pmr::memory_resource* mr) { vec.clear(); vec.reserve(size); diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h index 1628bbc52776b..36e78ef24020c 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h @@ -16,6 +16,8 @@ #ifndef TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_ #define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_ +#include + namespace o2::its { @@ -25,6 +27,36 @@ class ExternalAllocator virtual void* allocate(size_t) = 0; virtual void deallocate(char*, size_t) = 0; }; + +class ExternalAllocatorAdaptor final : public std::pmr::memory_resource +{ + public: + explicit ExternalAllocatorAdaptor(ExternalAllocator* alloc) : mAlloc(alloc) {} + + protected: + void* do_allocate(size_t bytes, size_t alignment) override + { + void* p = mAlloc->allocate(bytes); + if (!p) { + throw std::bad_alloc(); + } + return p; + } + + void do_deallocate(void* p, size_t bytes, size_t) override + { + mAlloc->deallocate(static_cast(p), bytes); + } + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override + { + return this == &other; + } + + private: + ExternalAllocator* mAlloc; +}; + } // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 7b5b84b1acde2..f1f16ce30a67d 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -95,7 +95,7 @@ struct TimeFrame { gsl::span::iterator& pattIt, const itsmft::TopologyDictionary* dict, const dataformats::MCTruthContainer* mcLabels = nullptr); - void resetROFrameData(); + void resetROFrameData(size_t nROFs); int getTotalClusters() const; auto& getTotVertIteration() { return mTotVertPerIteration; } @@ -233,23 +233,26 @@ struct TimeFrame { void setBz(float bz) { mBz = bz; } float getBz() const { return mBz; } - void setExternalAllocator(ExternalAllocator* allocator) + /// State if memory will be externally managed. 
+ // device + ExternalAllocator* mExtDeviceAllocator{nullptr}; + void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; } + ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; } + bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; } + // host + ExternalAllocator* mExtHostAllocator{nullptr}; + void setExternalHostAllocator(ExternalAllocator* allocator) { - if (isGPU()) { - LOGP(debug, "Setting timeFrame allocator to external"); - mAllocator = allocator; - } else { - LOGP(fatal, "External allocator is currently only supported for GPU"); - } + mExtHostAllocator = allocator; + mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExtHostAllocator); } - - ExternalAllocator* getExternalAllocator() { return mAllocator; } - - virtual void setDevicePropagator(const o2::base::PropagatorImpl*) - { - return; - }; + ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; } + bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; } + std::shared_ptr<BoundedMemoryResource> mExtMemoryPool; + BoundedMemoryResource* getMaybeExternalHostResource(bool force = false) { return (hasExternalHostAllocator() && !force) ? mExtMemoryPool.get() : mMemoryPool.get(); } + // Propagator const o2::base::PropagatorImpl* getDevicePropagator() const { return mPropagatorDevice; } + virtual void setDevicePropagator(const o2::base::PropagatorImpl*){}; template void addClusterToLayer(int layer, T&&... args); @@ -290,10 +293,6 @@ struct TimeFrame { bounded_vector mROFramesPV; bounded_vector mPrimaryVertices; - // State if memory will be externally managed. - ExternalAllocator* mAllocator = nullptr; - bool getExtAllocator() const noexcept { return mAllocator != nullptr; } - std::array, nLayers> mUnsortedClusters; std::vector> mTracklets; std::vector> mCells; diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx index 741acd227cb32..ab68a63fc394f 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx @@ -121,14 +121,15 @@ int TimeFrame::loadROFrameData(gsl::span rofs, const itsmft::TopologyDictionary* dict, const dataformats::MCTruthContainer* mcLabels) { - resetROFrameData(); + resetROFrameData(rofs.size()); GeometryTGeo* geom = GeometryTGeo::Instance(); geom->fillMatrixCache(o2::math_utils::bit2Mask(o2::math_utils::TransformType::T2L, o2::math_utils::TransformType::L2G)); - mNrof = 0; + mNrof = rofs.size(); clearResizeBoundedVector(mClusterSize, clusters.size(), mMemoryPool.get()); - for (auto& rof : rofs) { + for (size_t iRof{0}; iRof < rofs.size(); ++iRof) { + const auto& rof = rofs[iRof]; for (int clusterId{rof.getFirstEntry()}; clusterId < rof.getFirstEntry() + rof.getNEntries(); ++clusterId) { const auto& c = clusters[clusterId]; @@ -164,15 +165,13 @@ int TimeFrame::loadROFrameData(gsl::span rofs, addTrackingFrameInfoToLayer(layer, gloXYZ.x(), gloXYZ.y(), gloXYZ.z(), trkXYZ.x(), geom->getSensorRefAlpha(sensorID), std::array{trkXYZ.y(), trkXYZ.z()}, std::array{sigmaY2, sigmaYZ, sigmaZ2}); - /// Rotate to the global frame addClusterToLayer(layer, gloXYZ.x(), gloXYZ.y(), gloXYZ.z(), mUnsortedClusters[layer].size()); addClusterExternalIndexToLayer(layer, clusterId); } for (unsigned int iL{0}; iL < mUnsortedClusters.size(); ++iL) { - mROFramesClusters[iL].push_back(mUnsortedClusters[iL].size()); + mROFramesClusters[iL][iRof + 1] = mUnsortedClusters[iL].size(); // effectively calculating an exclusive prefix sum
} - mNrof++; } for (auto i = 0; i < mNTrackletsPerCluster.size(); ++i) { @@ -188,13 +187,13 @@ int TimeFrame::loadROFrameData(gsl::span rofs, } template -void TimeFrame::resetROFrameData() +void TimeFrame::resetROFrameData(size_t nRofs) { for (int iLayer{0}; iLayer < nLayers; ++iLayer) { - deepVectorClear(mUnsortedClusters[iLayer], mMemoryPool.get()); - deepVectorClear(mTrackingFrameInfo[iLayer], mMemoryPool.get()); + deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource()); + deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource()); + clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource()); deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get()); - clearResizeBoundedVector(mROFramesClusters[iLayer], 1, mMemoryPool.get(), 0); if (iLayer < 2) { deepVectorClear(mTrackletsIndexROF[iLayer], mMemoryPool.get()); @@ -298,11 +297,11 @@ void TimeFrame::initialise(const int iteration, const TrackingParameter clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get()); deepVectorClear(mTrackletClusters); for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) { - clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), mMemoryPool.get()); - clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), mMemoryPool.get()); + clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers)); + clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers)); mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]); } - clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), mMemoryPool.get()); + clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeExternalHostResource(maxLayers != nLayers)); clearResizeBoundedVector(mLines, mNrof, mMemoryPool.get()); clearResizeBoundedVector(mTrackletClusters, mNrof, mMemoryPool.get()); @@ -586,37 +585,24 @@ void TimeFrame::setMemoryPool(std::shared_ptr po { mMemoryPool = pool; - auto initVector = [&](bounded_vector & vec) - { - bounded_vector tmp(std::pmr::polymorphic_allocator{mMemoryPool.get()}); - vec.swap(tmp); - }; - auto initArrays = [&](std::array, S> & arr) + auto initVector = [&](bounded_vector & vec, bool useExternal = false) { - for (size_t i{0}; i < S; ++i) { - initVector(arr[i]); - } + vec = bounded_vector(std::pmr::polymorphic_allocator{(useExternal) ? 
mExtMemoryPool.get() : mMemoryPool.get()}); }; - auto initVectors = [&](std::vector> & vec) + auto initContainers = [&](Container & container, bool useExternal = false) { - for (size_t i{0}; i < vec.size(); ++i) { - initVector(vec[i]); + for (auto& v : container) { + initVector(v, useExternal); } }; - + // these will only reside on the host for the CPU part initVector(mTotVertPerIteration); initVector(mPrimaryVertices); initVector(mROFramesPV); - initArrays(mClusters); - initArrays(mTrackingFrameInfo); - initArrays(mClusterExternalIndices); - initArrays(mROFramesClusters); - initArrays(mNTrackletsPerCluster); - initArrays(mNTrackletsPerClusterSum); - initArrays(mNClustersPerROF); - initArrays(mIndexTables); - initArrays(mUsedClusters); - initArrays(mUnsortedClusters); + initContainers(mClusterExternalIndices); + initContainers(mNTrackletsPerCluster); + initContainers(mNTrackletsPerClusterSum); + initContainers(mNClustersPerROF); initVector(mROFramesPV); initVector(mPrimaryVertices); initVector(mRoads); @@ -628,12 +614,19 @@ initVector(mPValphaX); initVector(mBogusClusters); initVector(mVerticesContributorLabels); - initArrays(mTrackletsIndexROF); - initVectors(mTracks); - initVectors(mTracklets); - initVectors(mCells); - initVectors(mCellsNeighbours); - initVectors(mCellsLookupTable); + initContainers(mTrackletsIndexROF); + initContainers(mTracks); + initContainers(mTracklets); + initContainers(mCells); + initContainers(mCellsNeighbours); + initContainers(mCellsLookupTable); + // these will possibly use an externally provided allocator + initContainers(mClusters, hasExternalHostAllocator()); + initContainers(mUsedClusters, hasExternalHostAllocator()); + initContainers(mUnsortedClusters, hasExternalHostAllocator()); + initContainers(mIndexTables, hasExternalHostAllocator()); + initContainers(mTrackingFrameInfo, hasExternalHostAllocator()); + initContainers(mROFramesClusters, hasExternalHostAllocator()); } template diff --git a/Detectors/Upgrades/ITS3/reconstruction/src/IOUtils.cxx b/Detectors/Upgrades/ITS3/reconstruction/src/IOUtils.cxx index acba8022e376f..2fced813efc93 100644 --- a/Detectors/Upgrades/ITS3/reconstruction/src/IOUtils.cxx +++ b/Detectors/Upgrades/ITS3/reconstruction/src/IOUtils.cxx @@ -64,7 +64,7 @@ int loadROFrameDataITS3(its::TimeFrame<7>* tf, const its3::TopologyDictionary* dict, const dataformats::MCTruthContainer* mcLabels) { - tf->resetROFrameData(); + tf->resetROFrameData(rofs.size()); auto geom = its::GeometryTGeo::Instance(); geom->fillMatrixCache(o2::math_utils::bit2Mask(o2::math_utils::TransformType::T2L, o2::math_utils::TransformType::L2G)); diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index 26dff3710cd4a..a85cdb48c4d1c 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -17,7 +17,6 @@ #include "DataFormatsITS/TrackITS.h" #include "ITStracking/ExternalAllocator.h" #include "GPUReconstructionIncludesITS.h" -#include using namespace o2::gpu; @@ -26,15 +25,18 @@ namespace o2::its class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator { public: + GPUFrameworkExternalAllocator(GPUMemoryResource::MemoryType type) : mType(type) {} + void* allocate(size_t size) override { - return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); + return mFWReco->AllocateDirectMemory(size, mType); } - void deallocate(char* ptr, size_t) override {} + void deallocate(char* ptr, size_t size) override {}
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; } private: o2::gpu::GPUReconstruction* mFWReco; + GPUMemoryResource::MemoryType mType; }; } // namespace o2::its @@ -71,11 +73,12 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe() } #if !defined(GPUCA_STANDALONE) if (mITSTimeFrame->isGPU()) { - auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); }; - - mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator); - mFrameworkAllocator->setReconstructionFramework(rec()); - mITSTimeFrame->setExternalAllocator(mFrameworkAllocator.get()); + mFrameworkDeviceAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_GPU)); + mFrameworkDeviceAllocator->setReconstructionFramework(rec()); + mITSTimeFrame->setExternalDeviceAllocator(mFrameworkDeviceAllocator.get()); + mFrameworkHostAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_HOST)); + mFrameworkHostAllocator->setReconstructionFramework(rec()); + mITSTimeFrame->setExternalHostAllocator(mFrameworkHostAllocator.get()); } #endif return mITSTimeFrame.get(); diff --git a/GPU/GPUTracking/Global/GPUChainITS.h b/GPU/GPUTracking/Global/GPUChainITS.h index a607f66322bab..e95c2de2f2023 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.h +++ b/GPU/GPUTracking/Global/GPUChainITS.h @@ -40,9 +40,9 @@ class GPUChainITS final : public GPUChain int32_t Finalize() override; int32_t RunChain() override; - void RegisterPermanentMemoryAndProcessors() final {}; - void RegisterGPUProcessors() final {}; - void MemorySize(size_t&, size_t&) final {}; + void RegisterPermanentMemoryAndProcessors() final{}; + void RegisterGPUProcessors() final{}; + void MemorySize(size_t&, size_t&) final{}; o2::its::TrackerTraits<7>* GetITSTrackerTraits(); o2::its::VertexerTraits<7>* GetITSVertexerTraits(); @@ -53,7 +53,8 @@ class GPUChainITS final : public GPUChain std::unique_ptr> mITSTrackerTraits; std::unique_ptr> mITSVertexerTraits; std::unique_ptr> mITSTimeFrame; - std::unique_ptr mFrameworkAllocator; + std::unique_ptr mFrameworkDeviceAllocator; + std::unique_ptr mFrameworkHostAllocator; }; } // namespace o2::gpu From dd7a382d02667e6d4052158574d32daebc00ab24 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 19 Sep 2025 09:29:21 +0000 Subject: [PATCH 3/3] Please consider the following formatting changes --- .../ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h | 2 +- Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx | 6 ++---- GPU/GPUTracking/Global/GPUChainITS.h | 6 +++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index f1f16ce30a67d..949eda042e0f9 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -252,7 +252,7 @@ struct TimeFrame { BoundedMemoryResource* getMaybeExternalHostResource(bool force = false) { return (hasExternalHostAllocator() && !force) ? mExtMemoryPool.get() : mMemoryPool.get(); } // Propagator const o2::base::PropagatorImpl* getDevicePropagator() const { return mPropagatorDevice; } - virtual void setDevicePropagator(const o2::base::PropagatorImpl*){}; + virtual void setDevicePropagator(const o2::base::PropagatorImpl*) {}; template void addClusterToLayer(int layer, T&&... 
args); diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx index ab68a63fc394f..f3fa5a5fec522 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx @@ -585,12 +585,10 @@ void TimeFrame::setMemoryPool(std::shared_ptr po { mMemoryPool = pool; - auto initVector = [&](bounded_vector & vec, bool useExternal = false) - { + auto initVector = [&](bounded_vector& vec, bool useExternal = false) { vec = bounded_vector(std::pmr::polymorphic_allocator{(useExternal) ? mExtMemoryPool.get() : mMemoryPool.get()}); }; - auto initContainers = [&](Container & container, bool useExternal = false) - { + auto initContainers = [&](Container& container, bool useExternal = false) { for (auto& v : container) { initVector(v, useExternal); } diff --git a/GPU/GPUTracking/Global/GPUChainITS.h b/GPU/GPUTracking/Global/GPUChainITS.h index e95c2de2f2023..ab693bcef3f8b 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.h +++ b/GPU/GPUTracking/Global/GPUChainITS.h @@ -40,9 +40,9 @@ class GPUChainITS final : public GPUChain int32_t Finalize() override; int32_t RunChain() override; - void RegisterPermanentMemoryAndProcessors() final{}; - void RegisterGPUProcessors() final{}; - void MemorySize(size_t&, size_t&) final{}; + void RegisterPermanentMemoryAndProcessors() final {}; + void RegisterGPUProcessors() final {}; + void MemorySize(size_t&, size_t&) final {}; o2::its::TrackerTraits<7>* GetITSTrackerTraits(); o2::its::VertexerTraits<7>* GetITSVertexerTraits();
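
For reference, the ExternalAllocatorAdaptor introduced in ExternalAllocator.h can be exercised in isolation: it wraps the two-method allocate/deallocate interface of ExternalAllocator into a std::pmr::memory_resource, so any pmr container can draw its storage from an externally managed pool. A minimal sketch; the malloc-backed allocator below is a stand-in assumption, not part of this series:

#include "ITStracking/ExternalAllocator.h"

#include <cstddef>
#include <cstdlib>
#include <memory_resource>
#include <vector>

// Toy external allocator; in production the framework-provided one is used.
class MallocExternalAllocator final : public o2::its::ExternalAllocator
{
 public:
  void* allocate(std::size_t size) override { return std::malloc(size); }
  void deallocate(char* ptr, std::size_t) override { std::free(ptr); }
};

int main()
{
  MallocExternalAllocator ext;
  // The adaptor forwards do_allocate/do_deallocate to the raw interface;
  // note that the requested alignment is not forwarded.
  o2::its::ExternalAllocatorAdaptor adaptor{&ext};
  std::pmr::vector<float> buf{&adaptor}; // storage now comes from ext
  buf.resize(128);
  return 0; // buf returns its block through ext.deallocate() on destruction
}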
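The bounded resource follows the same pattern with a byte cap on top: within the cap it forwards to its upstream resource while accounting usage atomically, beyond it the allocation throws. A minimal sketch; the 1 KiB limit and the sizes are illustrative:

#include "ITStracking/BoundedAllocator.h"

#include <iostream>
#include <memory_resource>

int main()
{
  // Cap the resource at 1 KiB; allocations are forwarded to the default
  // pmr resource and accounted against the cap.
  o2::its::BoundedMemoryResource res{1024};

  o2::its::bounded_vector<int> ok{std::pmr::polymorphic_allocator<int>{&res}};
  ok.resize(64); // 256 B, fits

  try {
    o2::its::bounded_vector<int> tooBig{std::pmr::polymorphic_allocator<int>{&res}};
    tooBig.resize(4096); // ~16 KiB, beyond the cap: the resource throws
  } catch (...) {
    std::cout << "allocation beyond the cap was rejected\n";
  }
  return 0;
}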
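Finally, a note on the reworked mROFramesClusters bookkeeping in loadROFrameData: each layer's vector is sized nROFs + 1 and slot iRof + 1 receives the running cluster count, i.e. an exclusive prefix sum shifted by one, so per-ROF cluster ranges fall out as half-open intervals. A standalone sketch of the same indexing scheme with made-up counts:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  // Illustrative per-ROF cluster counts on a single layer.
  std::vector<int> nClustersPerROF{3, 0, 5, 2};

  // Offsets as in the patched loadROFrameData: size nROFs + 1, slot r + 1
  // holds the cumulative count up to and including ROF r.
  std::vector<int> offsets(nClustersPerROF.size() + 1, 0);
  for (std::size_t r = 0; r < nClustersPerROF.size(); ++r) {
    offsets[r + 1] = offsets[r] + nClustersPerROF[r];
  }

  // Clusters of ROF r are the half-open range [offsets[r], offsets[r + 1]).
  for (std::size_t r = 0; r < nClustersPerROF.size(); ++r) {
    std::printf("ROF %zu: clusters [%d, %d)\n", r, offsets[r], offsets[r + 1]);
  }
  return 0;
}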