Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ if(USE_CUDA)
list(APPEND test_operator_LINKER_LIBS CUDA::cudart GTest::gtest_main ${TE_LIB} CUDA::nvrtc CUDNN::cudnn)
target_link_libraries(test_operator PUBLIC ${test_operator_LINKER_LIBS} OpenMP::OpenMP_CXX)
else()
target_link_libraries(test_operator PUBLIC hip::host hip::device GTest::gtest_main ${TE_LIB} OpenMP::OpenMP_CXX)
target_link_libraries(test_operator PUBLIC hip::host hip::device GTest::gtest_main ${TE_LIB} OpenMP::OpenMP_CXX rocrand)
endif()
target_compile_options(test_operator PRIVATE -O2 -fopenmp)

Expand Down
51 changes: 50 additions & 1 deletion tests/cpp/test_common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,6 @@ std::pair<double, double> getTolerances(const DType type) {
template <typename T>
void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) {
#ifdef __HIP_PLATFORM_AMD__
// TODO: Introduce a parallel RNG library (Random123, PCG, rocRAND)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can probably remove this entire #ifdef guarded section, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, unless we want to keep it around for future reference?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove, we can always revert if we need it again.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The whole method seems unused on ROCm and can be guarded

std::uniform_real_distribution<> dis(-2.0, 1.0);
for (int i = 0; i < size; i++) {
data[i] = static_cast<T>(dis(*gen));
Expand Down Expand Up @@ -822,21 +821,71 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) {
#endif
}

#ifdef __HIP_PLATFORM_AMD__
#include <rocrand/rocrand.h>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though it does not cause errors, better move #include to the top of the file, out of test namespace


template <typename T>
__global__ void affine_transform_and_cast(float* __restrict__ in, T* __restrict__ out, size_t n, float lo, float hi) {
// Clamp values in *in* to [lo, hi] and cast to type *T* for *out*.
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
in[idx] = lo + (hi - lo) * in[idx];
out[idx] = static_cast<T>(in[idx]);
}
}

void fillUniformDevice(Tensor* t) {
void* dst = t->rowwise() ? t->rowwise_dptr() : t->columnwise_dptr();
const auto shape = t->rowwise() ? t->rowwise_shape() : t->columnwise_shape();
const size_t N = product(shape);

float* tmp = nullptr;
hipMalloc(&tmp, N * sizeof(float));

// per-tensor deterministic seed
const unsigned long long seed = static_cast<unsigned long long>(t->gen()());
rocrand_generator gen;
rocrand_create_generator(&gen, ROCRAND_RNG_PSEUDO_PHILOX4_32_10);
rocrand_set_seed(gen, seed);

rocrand_generate_uniform(gen, tmp, N);

// map to [-2.0, 1.0] (like generate_data_uniformly) and cast into tensor dtype
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T, {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

T should either be template parameter and no TRANSFORMER_ENGINE_TYPE_SWITCH_ALL here, or the method calling should be moved out of TRANSFORMER_ENGINE_TYPE_SWITCH_ALL in fillUniform

dim3 block(256);
dim3 grid((N + block.x - 1) / block.x);
hipLaunchKernelGGL(affine_transform_and_cast<T>, grid, block, 0, 0,
tmp, reinterpret_cast<T*>(dst), N, -2.0f, 1.0f);
});

rocrand_destroy_generator(gen);
hipFree(tmp);
}
#endif

void fillUniform(Tensor *t) {
if (t->rowwise()) {
const size_t size = product(t->rowwise_shape());
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T,
{
#ifdef __HIP_PLATFORM_AMD__
fillUniformDevice(t);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any test that tests this generation? I think using GPU generation here does not produce correct result because of using t->from_cpu() below in this method

#else
T *data = t->rowwise_cpu_dptr<T>();
generate_data_uniformly(data, size, &(t->gen()));
#endif
}
);
} else {
const size_t size = product(t->columnwise_shape());
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T,
{
#ifdef __HIP_PLATFORM_AMD__
fillUniformDevice(t);
#else
T *data = t->columnwise_cpu_dptr<T>();
generate_data_uniformly(data, size, &(t->gen()));
#endif
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we able to use rocRAND for fillCase_special as well? Also, I think there were a few tests that for some reason generate their own data... I might be wrong about that, or it may have been updated.

}
);
}
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ find_package(OpenMP REQUIRED)
if(USE_CUDA)
target_link_libraries(test_util PUBLIC CUDA::cudart GTest::gtest_main ${TE_LIB} CUDA::nvrtc CUDNN::cudnn OpenMP::OpenMP_CXX)
else()
target_link_libraries(test_util PUBLIC hip::host hip::device GTest::gtest_main ${TE_LIB} OpenMP::OpenMP_CXX)
target_link_libraries(test_util PUBLIC hip::host hip::device GTest::gtest_main ${TE_LIB} OpenMP::OpenMP_CXX rocrand)
endif()
target_compile_options(test_util PRIVATE -O2 -fopenmp)

Expand Down