Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 6 additions & 248 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "TPCPadGainCalib.h"
#include "TPCZSLinkMapping.h"
#include "GPUTPCGeometry.h"
#include "DetectorsRaw/RDHUtils.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;
Expand Down Expand Up @@ -251,8 +252,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx&
if (discardTimeBin) {
FillWithInvalid(ctx.clusterer, ctx.iThread, ctx.nThreads, ctx.pageDigitOffset, nAdc);
} else {
#ifdef GPUCA_GPUCODE
DecodeTBMultiThread(
DecodeTB(
smem,
ctx,
adcData,
Expand All @@ -261,16 +261,6 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx&
timeBin,
decHdr->cruID,
tbHdr->fecInPartition);
#else // CPU
DecodeTBSingleThread(
ctx,
adcData,
nAdc,
channelMask,
timeBin,
decHdr->cruID,
tbHdr->fecInPartition);
#endif
}

ctx.pageDigitOffset += nAdc;
Expand All @@ -290,62 +280,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx&
return ctx.pageDigitOffset;
}

// Decode all ADC samples of one time bin sequentially (single-thread path).
//
// ctx            - decoding context; supplies the clusterer and the digit offset
//                  (ctx.pageDigitOffset) at which this time bin's output starts.
// adcData        - start of the packed ADC samples of this time bin.
// nAdc           - number of samples to decode.
// channelMask    - bitmask queried through ChannelIsActive() to find the channels
//                  that carry a sample; the i-th decoded sample belongs to the
//                  i-th active channel.
// timeBin        - absolute time bin, converted with fragment.toLocal() on write.
// cru            - CRU id forwarded to GetPadAndRowFromFEC().
// fecInPartition - FEC index forwarded to GetPadAndRowFromFEC().
//
// Sample j is written to output position ctx.pageDigitOffset + j.
GPUd() void GPUTPCCFDecodeZSLink::DecodeTBSingleThread(
  DecodeCtx& ctx,
  const uint8_t* adcData,
  uint32_t nAdc,
  const uint32_t* channelMask,
  int32_t timeBin,
  int32_t cru,
  int32_t fecInPartition)
{
  const CfFragment& fragment = ctx.clusterer.mPmemory->fragment;

  if constexpr (TPCZSHDRV2::TIGHTLY_PACKED_V3) {

    // `byte` is a bit accumulator filled one input byte at a time,
    // `bits` is the number of valid bits it currently holds.
    uint32_t byte = 0, bits = 0, nSamplesWritten = 0, rawFECChannel = 0;

    // unpack adc values, assume tightly packed data
    while (nSamplesWritten < nAdc) {
      byte |= adcData[0] << bits;
      adcData++;
      bits += CHAR_BIT;
      while (bits >= DECODE_BITS) {

        // Find next channel with data
        for (; !ChannelIsActive(channelMask, rawFECChannel); rawFECChannel++) {
        }

        // Unpack data for cluster finder
        o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannel, fecInPartition);

        // Convert the sample with ADCToFloat, exactly like the non-packed branch
        // below, instead of handing the whole accumulator to WriteCharge: the
        // accumulator may still contain bits of the following sample and has not
        // been scaled by DECODE_BITS_FACTOR.
        // NOTE(review): relies on ADCToFloat applying DECODE_MASK to its input - confirm.
        float charge = ADCToFloat(byte, DECODE_MASK, DECODE_BITS_FACTOR);
        WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + nSamplesWritten);

        byte = byte >> DECODE_BITS;
        bits -= DECODE_BITS;
        nSamplesWritten++;
        rawFECChannel++; // Ensure we don't decode same channel twice
      } // while (bits >= DECODE_BITS)
    } // while (nSamplesWritten < nAdc)

  } else { // ! TPCZSHDRV2::TIGHTLY_PACKED_V3
    // Samples are stored SAMPLESPER64BIT per 64-bit word, DECODE_BITS apart.
    uint32_t rawFECChannel = 0;
    const uint64_t* adcData64 = (const uint64_t*)adcData;
    for (uint32_t j = 0; j < nAdc; j++) {
      // Find next channel with data
      for (; !ChannelIsActive(channelMask, rawFECChannel); rawFECChannel++) {
      }

      uint32_t adc = (adcData64[j / TPCZSHDRV2::SAMPLESPER64BIT] >> ((j % TPCZSHDRV2::SAMPLESPER64BIT) * DECODE_BITS)) & DECODE_MASK;

      o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannel, fecInPartition);
      float charge = ADCToFloat(adc, DECODE_MASK, DECODE_BITS_FACTOR);
      WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + j);
      rawFECChannel++; // Ensure we don't decode same channel twice
    }
  }
}

GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread(
GPUd() void GPUTPCCFDecodeZSLink::DecodeTB(
GPUSharedMemory& smem,
DecodeCtx& ctx,
const uint8_t* adcData,
Expand All @@ -368,26 +303,6 @@ GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread(
uint8_t myOffset = warp_scan_inclusive_add(myChannelActive) - 1 + blockOffset;
blockOffset = warp_broadcast(myOffset, NTHREADS - 1) + 1;

// Decode entire timebin at once if we have enough threads
// This should further improve performance, but code below is buggy...
// if (nAdc <= NThreads) {
// for (int32_t j = 1; blockOffset < nAdc; j++) {
// rawFECChannel = myChannelActive ? rawFECChannel : (iThread + j*NThreads - myOffset);

// bool iAmIdle = not myChannelActive;

// myChannelActive =
// rawFECChannel < zerosupp_link_based::CommonHeaderlPerTBHeader
// ? BitIsSet(channelMask, rawFECChannel)
// : false;

// uint8_t newOffset = warp_scan_inclusive_add(static_cast<uint8_t>(myChannelActive && iAmIdle)) - 1 + blockOffset;
// blockOffset = warp_broadcast(newOffset, NThreads - 1) + 1;

// myOffset = iAmIdle ? newOffset : myOffset;
// }
// }

if (not myChannelActive) {
continue;
}
Expand All @@ -397,28 +312,16 @@ GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread(

if constexpr (TPCZSHDRV2::TIGHTLY_PACKED_V3) {

// Try to access adcData with 4 byte reads instead of 1 byte.
// You'd think this would improve performance, but it's actually slower...
// const uint32_t* adcDataU32 = reinterpret_cast<const uint32_t*>(adcData);

uint32_t adcBitOffset = myOffset * DECODE_BITS;
uint32_t adcByteOffset = adcBitOffset / CHAR_BIT;
uint32_t adcOffsetInByte = adcBitOffset - adcByteOffset * CHAR_BIT;
// uint32_t adcByteOffset = adcBitOffset / 32;
// uint32_t adcOffsetInByte = adcBitOffset - adcByteOffset * 32;

uint32_t byte = 0, bits = 0;

// uint32_t byte = adcDataU32[adcByteOffset] >> adcOffsetInByte;
// uint32_t bits = 32 - adcOffsetInByte;
// adcByteOffset++;

while (bits < DECODE_BITS) {
byte |= ((uint32_t)adcData[adcByteOffset]) << bits;
// byte |= adcDataU32[adcByteOffset] << bits;
adcByteOffset++;
bits += CHAR_BIT;
// bits += 32;
}
adc = byte >> adcOffsetInByte;

Expand Down Expand Up @@ -601,12 +504,6 @@ GPUd() void GPUTPCCFDecodeZSDenseLink::Thread<0>(int32_t nBlocks, int32_t nThrea

GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& ctx)
{
#ifdef GPUCA_GPUCODE
constexpr bool DecodeInParallel = true;
#else
constexpr bool DecodeInParallel = false;
#endif

const uint8_t* const pageStart = ctx.page;

const auto* rawDataHeader = Peek<header::RAWDataHeader>(ctx.page);
Expand Down Expand Up @@ -651,13 +548,13 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, Dec
}

if ((uint16_t)(raw::RDHUtils::getPageCounter(rawDataHeader) + 1) == raw::RDHUtils::getPageCounter(nextPage)) {
nSamplesWrittenTB = DecodeTB<DecodeInParallel, true>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
nSamplesWrittenTB = DecodeTB<true>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
} else {
err = GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF;
break;
}
} else {
nSamplesWrittenTB = DecodeTB<DecodeInParallel, false>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
nSamplesWrittenTB = DecodeTB<false>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
}

// Abort decoding the page if an error was detected.
Expand Down Expand Up @@ -712,30 +609,8 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, Dec
return ctx.pageDigitOffset;
}

// Compile-time dispatch between the two time-bin decoders.
//
// DecodeInParallel == true : every thread takes part via DecodeTBMultiThread.
// DecodeInParallel == false: only thread 0 runs DecodeTBSingleThread; its return
//                            value is then passed through warp_broadcast(…, 0),
//                            which all threads call.
//
// Returns whatever the selected decoder returns (sample count or negative error).
template <bool DecodeInParallel, bool PayloadExtendsToNextPage>
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
  [[maybe_unused]] GPUSharedMemory& smem,
  DecodeCtx& ctx,
  const header::RAWDataHeader* rawDataHeader,
  int32_t cru,
  uint16_t nSamplesLeftInPage,
  const uint8_t* payloadEnd,
  const uint8_t* nextPage)
{
  if constexpr (!DecodeInParallel) {
    // Sequential path: smem is not needed here.
    int16_t nSamplesWritten = 0;
    const bool isFirstThread = (ctx.iThread == 0);
    if (isFirstThread) {
      nSamplesWritten = DecodeTBSingleThread<PayloadExtendsToNextPage>(ctx, rawDataHeader, cru, nSamplesLeftInPage, payloadEnd, nextPage);
    }
    return warp_broadcast(nSamplesWritten, 0);
  } else {
    // Parallel path.
    return DecodeTBMultiThread<PayloadExtendsToNextPage>(smem, ctx, rawDataHeader, cru, nSamplesLeftInPage, payloadEnd, nextPage);
  }
}

template <bool PayloadExtendsToNextPage>
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
GPUSharedMemory& smem,
DecodeCtx& ctx,
const header::RAWDataHeader* rawDataHeader,
Expand Down Expand Up @@ -883,123 +758,6 @@ GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
#undef MAYBE_PAGE_OVERFLOW
}

// Sequentially decode one timebin block of a dense-link page.
//
// Reads the 2-byte timebin block header, then one header (plus channel bitmasks)
// per link, then unpacks the ADC samples and writes them through WriteCharge().
// ctx.page is advanced past everything that was consumed.
//
// Template parameter:
//   PayloadExtendsToNextPage - selects how running past payloadEnd is handled,
//                              see MAYBE_PAGE_OVERFLOW below.
//
// Returns the number of samples written, the result of FillWithInvalid() if the
// timebin is discarded, or a negated GPUErrors code:
//   -ERROR_TPCZS_PAGE_OVERFLOW  read pointer passed payloadEnd (only when
//                               PayloadExtendsToNextPage is false)
//   -ERROR_TPCZS_INVALID_NADC   the bitmasks announce more samples than
//                               nSamplesLeftInPage
template <bool PayloadExtendsToNextPage>
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
DecodeCtx& ctx,
const header::RAWDataHeader* rawDataHeader,
int32_t cru,
uint16_t nSamplesLeftInPage,
const uint8_t* payloadEnd,
const uint8_t* nextPage)
{
// Bounds handling applied after every read:
//  - PayloadExtendsToNextPage: if the pointer landed in [payloadEnd, nextPage),
//    move it to nextPage and skip sizeof(RAWDataHeader) plus the number of bytes
//    it had overshot payloadEnd by.
//  - otherwise: a pointer beyond payloadEnd aborts the function with
//    -ERROR_TPCZS_PAGE_OVERFLOW.
#define MAYBE_PAGE_OVERFLOW(pagePtr) \
if constexpr (PayloadExtendsToNextPage) { \
if (pagePtr >= payloadEnd && pagePtr < nextPage) { \
ptrdiff_t diff = pagePtr - payloadEnd; \
pagePtr = nextPage; \
ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \
} \
} else { \
if (pagePtr > payloadEnd) { \
return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \
} \
}

using zerosupp_link_based::ChannelPerTBHeader;

const CfFragment& fragment = ctx.clusterer.mPmemory->fragment;

// Per-link state collected from the link headers. Each link owns 10 mask bytes;
// bytes that are not present in the stream stay 0 (no active channel).
uint8_t linkIds[MaxNLinksPerTimebin];
uint8_t channelMasks[MaxNLinksPerTimebin * 10] = {0};
uint16_t nSamplesWritten = 0;

// Read timebin block header
// (16 bit, low byte first; each byte is bounds-checked separately)
uint16_t tbbHdr = ConsumeByte(ctx.page);
MAYBE_PAGE_OVERFLOW(ctx.page);
tbbHdr |= static_cast<uint16_t>(ConsumeByte(ctx.page)) << CHAR_BIT;
MAYBE_PAGE_OVERFLOW(ctx.page);

// Low 4 bits: number of links in this timebin; upper 12 bits: linkBC.
uint8_t nLinksInTimebin = tbbHdr & 0x000F;
uint16_t linkBC = (tbbHdr & 0xFFF0) >> 4;
// Time bin = (linkBC + orbits since ctx.firstHBF * LHCMaxBunches) / LHCBCPERTIMEBIN.
int32_t timeBin = (linkBC + (uint64_t)(raw::RDHUtils::getHeartBeatOrbit(*rawDataHeader) - ctx.firstHBF) * constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;

// Total number of samples announced by all channel bitmasks of this timebin.
uint16_t nSamplesInTB = 0;

// Read timebin link headers
for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
uint8_t timebinLinkHeaderStart = ConsumeByte(ctx.page);
MAYBE_PAGE_OVERFLOW(ctx.page);

// Bits 0-4: link id.
linkIds[iLink] = timebinLinkHeaderStart & 0b00011111;

// Bit 5: "flat" flag - no second-level bitmask follows.
bool bitmaskIsFlat = timebinLinkHeaderStart & 0b00100000;

// Second-level bitmask: bit i set means mask byte i of this link is present.
// Flat links keep the all-ones default; otherwise bits 6-7 of the header byte
// become bits 8-9 and the next byte supplies bits 0-7.
uint16_t bitmaskL2 = 0x0FFF;
if (not bitmaskIsFlat) {
bitmaskL2 = static_cast<uint16_t>(timebinLinkHeaderStart & 0b11000000) << 2 | static_cast<uint16_t>(ConsumeByte(ctx.page));
MAYBE_PAGE_OVERFLOW(ctx.page);
}

// Consume one channel-mask byte per set bit; every set bit in that byte
// announces one ADC sample.
for (int32_t i = 0; i < 10; i++) {
if (bitmaskL2 & 1 << i) {
nSamplesInTB += CAMath::Popcount(*Peek(ctx.page));
channelMasks[10 * iLink + i] = ConsumeByte(ctx.page);
MAYBE_PAGE_OVERFLOW(ctx.page);
}
}

} // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)

// Reject timebins that claim more samples than the page has left.
if (nSamplesInTB > nSamplesLeftInPage) {
return -GPUErrors::ERROR_TPCZS_INVALID_NADC;
}

// Remember where the ADC data starts and advance ctx.page past it
// (nSamplesInTB * DECODE_BITS bits, rounded up to whole bytes).
const uint8_t* adcData = ConsumeBytes(ctx.page, (nSamplesInTB * DECODE_BITS + 7) / 8);
MAYBE_PAGE_OVERFLOW(ctx.page);

// Discard timebins outside the current fragment or beyond the optional
// time-bin cut (the cut is only applied when ctx.tpcTimeBinCut > 0).
bool discardTimeBin = not fragment.contains(timeBin);
discardTimeBin |= (ctx.tpcTimeBinCut > 0 && timeBin > ctx.tpcTimeBinCut);

if (discardTimeBin) {
// Called with thread index 0 and thread count 1, i.e. as a single thread.
return FillWithInvalid(ctx.clusterer, 0, 1, ctx.pageDigitOffset, nSamplesInTB);
}

// Unpack ADC
// `byte` is a bit accumulator, `bits` the number of valid bits it holds.
uint32_t byte = 0, bits = 0;
// Channel index running across all links of the timebin.
uint16_t rawFECChannel = 0;

// unpack adc values, assume tightly packed data
while (nSamplesWritten < nSamplesInTB) {
byte |= static_cast<uint32_t>(ConsumeByte(adcData)) << bits;
MAYBE_PAGE_OVERFLOW(adcData);
bits += CHAR_BIT;
while (bits >= DECODE_BITS) {

// Find next channel with data
for (; !ChannelIsActive(channelMasks, rawFECChannel); rawFECChannel++) {
}

// Split the running channel index into link slot and channel within the link.
int32_t iLink = rawFECChannel / ChannelPerTBHeader;
int32_t rawFECChannelLink = rawFECChannel % ChannelPerTBHeader;

// Unpack data for cluster finder
o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannelLink, linkIds[iLink]);

float charge = ADCToFloat(byte, DECODE_MASK, DECODE_BITS_FACTOR);
WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + nSamplesWritten);

byte >>= DECODE_BITS;
bits -= DECODE_BITS;
nSamplesWritten++;
rawFECChannel++; // Ensure we don't decode same channel twice
} // while (bits >= DECODE_BITS)
} // while (nSamplesWritten < nSamplesInTB)

return nSamplesWritten;

#undef MAYBE_PAGE_OVERFLOW
}

GPUd() bool GPUTPCCFDecodeZSDenseLink::ChannelIsActive(const uint8_t* chan, uint16_t chanIndex)
{
constexpr uint8_t N_BITS_PER_ENTRY = sizeof(*chan) * CHAR_BIT;
Expand Down
13 changes: 3 additions & 10 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include "TPCBase/PadPos.h"
#include "DataFormatsTPC/ZeroSuppression.h"
#include "DataFormatsTPC/ZeroSuppressionLinkBased.h"
#include "DetectorsRaw/RDHUtils.h"
#include "Headers/RAWDataHeader.h"

namespace o2::gpu
{
Expand Down Expand Up @@ -148,8 +148,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase
GPUd() static void GetChannelBitmask(const tpc::zerosupp_link_based::CommonHeader& tbHdr, uint32_t* chan);
GPUd() static bool ChannelIsActive(const uint32_t* chan, uint8_t chanIndex);

GPUd() static void DecodeTBSingleThread(DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition);
GPUd() static void DecodeTBMultiThread(GPUSharedMemory& smem, DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition);
GPUd() static void DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition);
};

class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase
Expand Down Expand Up @@ -179,14 +178,8 @@ class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase
// Decode a single timebin within an 8kb page.
// Returns the number of samples decoded from the page
// or negative value to indicate an error (no samples are written in this case)
template <bool DecodeInParallel, bool PayloadExtendsToNextPage>
GPUd() static int16_t DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);

template <bool PayloadExtendsToNextPage>
GPUd() static int16_t DecodeTBSingleThread(DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);

template <bool PayloadExtendsToNextPage>
GPUd() static int16_t DecodeTBMultiThread(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);
GPUd() static int16_t DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);
};

} // namespace o2::gpu
Expand Down