From d19df0fbd25818ac75f9ff4ffda2560fb55cff9d Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 31 Mar 2025 17:36:41 +0200 Subject: [PATCH 1/2] Use typed external allocator for some thrust items --- .../GPU/ITStrackingGPU/TrackingKernels.h | 12 +++- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 2 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 5 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 60 +++++++++++++++---- .../include/ITStracking/ExternalAllocator.h | 2 +- .../tracking/include/ITStracking/TimeFrame.h | 29 +++++---- GPU/GPUTracking/Global/GPUChainITS.cxx | 2 +- 7 files changed, 80 insertions(+), 32 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 54aa0e01c8a78..de6ac415dfc40 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -16,9 +16,12 @@ #include "DetectorsBase/Propagator.h" #include "GPUCommonDef.h" -namespace o2::its +namespace o2 +{ +namespace its { class CellSeed; +class ExternalAllocator; namespace gpu { #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler @@ -178,7 +181,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int filterCellNeighboursHandler(gpuPair*, int*, - unsigned int); + unsigned int, + o2::its::ExternalAllocator* = nullptr); template void processNeighboursHandler(const int startLayer, @@ -191,6 +195,7 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator*, const float bz, const float MaxChi2ClusterAttachment, const float maxChi2NDF, @@ -212,5 +217,6 @@ void trackSeedHandler(CellSeed* trackSeeds, const o2::base::PropagatorF::MatCorrType matCorrType, const int nBlocks, const int nThreads); -} // namespace o2::its +} // namespace its +} // namespace o2 #endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_ diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 9769930504f29..df055442cbe25 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) +add_compile_options(-O0 -g -lineinfo -fPIC) # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 871fd7a95f890..89d2b5aeffe63 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -18,6 +18,7 @@ #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" #include "ITStracking/TrackingConfigParam.h" + namespace o2::its { constexpr int UnusedIndex{-1}; @@ -209,7 +210,8 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), - nNeigh); + nNeigh, + mTimeFrameGPU->getExternalAllocator()); } mTimeFrameGPU->createNeighboursDeviceArray(); mTimeFrameGPU->unregisterRest(); @@ -236,6 +238,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), trackSeeds, + mTimeFrameGPU->getExternalAllocator(), this->mBz, this->mTrkParams[0].MaxChi2ClusterAttachment, this->mTrkParams[0].MaxChi2NDF, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 301f37767c160..5fe4bcd4610c9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -28,6 +28,7 @@ #include "ITStracking/Constants.h" #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" +#include "ITStracking/ExternalAllocator.h" #include "DataFormatsITS/TrackITS.h" #include "ReconstructionDataFormats/Vertex.h" @@ -35,8 +36,6 @@ #include "ITStrackingGPU/TrackingKernels.h" #include "ITStrackingGPU/Utils.h" -#include "GPUCommonHelpers.h" - #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda #else @@ -64,6 +63,37 @@ GPUdii() float Sq(float v) namespace gpu { +template +class TypedAllocator : public thrust::device_allocator +{ + public: + using value_type = T; + using pointer = T*; + + template + struct rebind { + using other = TypedAllocator; + }; + + explicit TypedAllocator(ExternalAllocator* allocPtr) + : mInternalAllocator(allocPtr) {} + + T* allocate(size_t n) + { + return reinterpret_cast(mInternalAllocator->allocate(n * sizeof(T))); + } + + void deallocate(T* p, size_t n) + { + char* raw_ptr = reinterpret_cast(p); + size_t bytes = n * sizeof(T); + mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op. + } + + private: + ExternalAllocator* mInternalAllocator; +}; + GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, const o2::its::IndexTableUtils& utils, const float z1, const float z2, float maxdeltaz, float maxdeltaphi) @@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, int* cellNeighbours, - unsigned int nNeigh) + unsigned int nNeigh, + o2::its::ExternalAllocator* allocator) { thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); @@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator* allocator, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, @@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer, const int nBlocks, const int nThreads) { - thrust::device_vector foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. - // TODO: fix this. + auto allocInt = gpu::TypedAllocator(allocator); + auto allocCellSeed = gpu::TypedAllocator(allocator); + thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency. + // TODO: fix this. gpu::processNeighboursKernel<<>>( @@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer, matCorrType); gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1); - thrust::device_vector updatedCellId(foundSeedsTable.back()); - thrust::device_vector updatedCellSeed(foundSeedsTable.back()); + thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); + thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); gpu::processNeighboursKernel<<>>( startLayer, @@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer, matCorrType); int level = startLevel; - thrust::device_vector lastCellId; - thrust::device_vector lastCellSeed; + thrust::device_vector> lastCellId(allocInt); + thrust::device_vector> lastCellSeed(allocCellSeed); for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { lastCellSeed.swap(updatedCellSeed); lastCellId.swap(updatedCellId); - thrust::device_vector().swap(updatedCellSeed); - thrust::device_vector().swap(updatedCellId); + thrust::device_vector>(allocCellSeed).swap(updatedCellSeed); + thrust::device_vector>(allocInt).swap(updatedCellId); auto lastCellSeedSize{lastCellSeed.size()}; foundSeedsTable.resize(lastCellSeedSize + 1); thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); @@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer, propagator, matCorrType); } - - thrust::device_vector outSeeds(updatedCellSeed.size()); + thrust::device_vector> outSeeds(updatedCellSeed.size(), allocCellSeed); auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); auto s{end - outSeeds.begin()}; seedsHost.reserve(seedsHost.size() + s); @@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator*, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h index 9bdb2905ba9ba..1628bbc52776b 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h @@ -23,8 +23,8 @@ class ExternalAllocator { public: virtual void* allocate(size_t) = 0; + virtual void deallocate(char*, size_t) = 0; }; - } // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index f6bb9a9b11e66..bead39e713024 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -230,7 +230,23 @@ struct TimeFrame { void setBz(float bz) { mBz = bz; } float getBz() const { return mBz; } - virtual void setDevicePropagator(const o2::base::PropagatorImpl*) { return; } + void setExternalAllocator(ExternalAllocator* allocator) + { + if (mIsGPU) { + LOGP(debug, "Setting timeFrame allocator to external"); + mAllocator = allocator; + mExtAllocator = true; // to be removed + } else { + LOGP(debug, "External allocator is currently only supported for GPU"); + } + } + + ExternalAllocator* getExternalAllocator() { return mAllocator; } + + virtual void setDevicePropagator(const o2::base::PropagatorImpl*) + { + return; + }; const o2::base::PropagatorImpl* getDevicePropagator() const { return mPropagatorDevice; } template @@ -277,17 +293,6 @@ struct TimeFrame { // State if memory will be externally managed. bool mExtAllocator = false; ExternalAllocator* mAllocator = nullptr; - void setExternalAllocator(ExternalAllocator* allocator) - { - if (mIsGPU) { - LOGP(debug, "Setting timeFrame allocator to external"); - mAllocator = allocator; - mExtAllocator = true; // to be removed - } else { - LOGP(fatal, "External allocator is currently only supported for GPU"); - } - } - void setExtAllocator(bool ext) { mExtAllocator = ext; } bool getExtAllocator() const { return mExtAllocator; } std::array, nLayers> mUnsortedClusters; diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index bcb99fff87a64..e11b26b3d62d6 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator { return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); } - + void deallocate(char* ptr, size_t) {} void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; } private: From aaef7e970c9925cdee58b988112971885297a3ff Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 6 Jun 2025 07:54:07 +0000 Subject: [PATCH 2/2] Please consider the following formatting changes --- Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 5fe4bcd4610c9..18c89d39adda0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1230,7 +1230,7 @@ void processNeighboursHandler(const int startLayer, int level = startLevel; thrust::device_vector> lastCellId(allocInt); - thrust::device_vector> lastCellSeed(allocCellSeed); + thrust::device_vector> lastCellSeed(allocCellSeed); for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { lastCellSeed.swap(updatedCellSeed); lastCellId.swap(updatedCellId);