diff --git a/cpp/examples/performance_timers.cpp b/cpp/examples/performance_timers.cpp index 9518c5a6f2..98ecf755e4 100644 --- a/cpp/examples/performance_timers.cpp +++ b/cpp/examples/performance_timers.cpp @@ -19,6 +19,9 @@ */ #include "memilio/timer/auto_timer.h" #include "memilio/timer/table_printer.h" +#include "memilio/timer/timer_registrar.h" +#include "memilio/utils/logging.h" +#include "memilio/utils/miompi.h" #include // This is only used for the example load function. @@ -40,8 +43,15 @@ void load() int main() { - // Specify the namespace of AutoTimer, so we don't have to repeat it. Do not do this with entire namespaces. + mio::set_log_level(mio::LogLevel::info); + mio::mpi::init(); + + const int comm_rank = mio::mpi::rank(mio::mpi::get_world()); + const int comm_size = mio::mpi::size(mio::mpi::get_world()); + // Specify the namespace of AutoTimer and TimerRegistrar, so we don't have to repeat it. + // Avoid "using" statements with entire namespaces. using mio::timing::AutoTimer; + using mio::timing::TimerRegistrar; // Start immediately timing the main function. An AutoTimer starts timing upon its creation, and ends timing when // it is destroyed. This usually happens when a function returns, or a scope indicated by {curly braces} ends. @@ -54,15 +64,21 @@ int main() // the end of the programm. This can be disabled by calling `TimerRegistrar::disable_final_timer_summary()`. auto printer = std::make_unique(); printer->set_time_format("{:e}"); - mio::timing::TimerRegistrar::get_instance().set_printer(std::move(printer)); + TimerRegistrar::get_instance().set_printer(std::move(printer)); - // To manually print all timers, use `TimerRegistrar::print_timers()`, but make sure that no timers are running. + // To manually print all timers, use `TimerRegistrar::print_timers()` (or `TimerRegistrar::print_gathered_timers()` + // when using MPI), but make sure that no timers are running. // The "main" timer in this example would make that difficult, but you can simply add another scope around it, // similar to the "compute loops" timer below. - const int N = 1000; // Number of iterations. Be wary of increasing this number when using the sleep_for load! + // Number of iterations. Be wary of increasing this number when using the sleep_for load! + const int total_iterations = 1000; + const int N = total_iterations / comm_size + (comm_rank < (total_iterations % comm_size)); - std::cout << "Num threads: " << mio::omp::get_max_threads() << "\n"; + if (mio::mpi::is_root()) { + mio::log_info("Num ranks: {} - Num threads: {} - Work size per rank: {}", comm_size, + mio::omp::get_max_threads(), N); + } // Open a new scope to time computations. { @@ -100,5 +116,15 @@ int main() } } + // For MPI parallel execution, we replace the final timer summary with a manual print. That way, we can gather + // timers from all ranks into a single output, instead of each rank printing a separate output. You can also use a + // custom printer for computing additional statistics. + if (comm_size > 1) { + TimerRegistrar::get_instance().disable_final_timer_summary(); + TimerRegistrar::get_instance().print_gathered_timers(mio::mpi::get_world()); + } + + mio::mpi::finalize(); + return 0; } diff --git a/cpp/memilio/timer/basic_timer.h b/cpp/memilio/timer/basic_timer.h index 9c50f821d0..2a9cbe6cf7 100644 --- a/cpp/memilio/timer/basic_timer.h +++ b/cpp/memilio/timer/basic_timer.h @@ -20,9 +20,11 @@ #ifndef MIO_TIMER_BASIC_TIMER_H #define MIO_TIMER_BASIC_TIMER_H +#include "memilio/io/io.h" #include "memilio/timer/definitions.h" #include +#include namespace mio { @@ -69,6 +71,37 @@ class BasicTimer should_be_running(false, "~BasicTimer"); } + /** + * serialize this. + * @see mio::serialize + */ + template + void serialize(IOContext& io) const + { + auto obj = io.create_object("BasicTimer"); + obj.add_element("elapsed_time", details::convert_to_ticks(m_elapsed_time)); + } + + /** + * deserialize an object of this class. + * @see mio::deserialize + */ + template + static IOResult deserialize(IOContext& io) + { + using Tick = decltype(details::convert_to_ticks(std::declval())); + auto obj = io.expect_object("BasicTimer"); + auto et = obj.expect_element("elapsed_time", Tag{}); + return apply( + io, + [](auto&& et_) { + BasicTimer b; + b.m_elapsed_time = DurationType{et_}; + return b; + }, + et); + } + private: TimeType m_start_time; ///< The last time point at which start() was called DurationType m_elapsed_time{0}; ///< The total time spent between starts and stops. diff --git a/cpp/memilio/timer/definitions.h b/cpp/memilio/timer/definitions.h index c385f7c210..4a08d01ed8 100644 --- a/cpp/memilio/timer/definitions.h +++ b/cpp/memilio/timer/definitions.h @@ -23,7 +23,7 @@ #include "memilio/utils/mioomp.h" #include -#include +#include namespace mio { @@ -42,6 +42,21 @@ using DurationType = std::chrono::steady_clock::duration; #endif +namespace details +{ + +/// @brief Convert a duration to integer ticks. Useful for serialization. +inline decltype(auto) convert_to_ticks(DurationType duration) +{ +#ifdef MEMILIO_ENABLE_OPENMP + return duration; +#else + return duration.count(); +#endif +} + +} // namespace details + /** * @brief Convert a duration to a (floating point) number of seconds. * @param[in] duration Any DurationType value, mainly `BasicTimer::get_elapsed_time()`. diff --git a/cpp/memilio/timer/list_printer.h b/cpp/memilio/timer/list_printer.h index bd94d4e2bb..7ac0d931d6 100644 --- a/cpp/memilio/timer/list_printer.h +++ b/cpp/memilio/timer/list_printer.h @@ -21,6 +21,7 @@ #define MIO_TIMER_LIST_PRINTER_H #include "memilio/timer/registration.h" +#include "memilio/utils/compiler_diagnostics.h" #include #include @@ -45,16 +46,16 @@ struct ListPrinter : public Printer { const auto indent = " "; // Write out all timers out << "All Timers: " << timer_register.size() << "\n"; - for (const auto& [name, scope, timer, thread] : timer_register) { + for (const auto& [name, scope, timer, thread, rank] : timer_register) { out << indent << qualified_name(name, scope) << ": " << std::scientific - << time_in_seconds(timer.get_elapsed_time()) << " (" << thread << ")\n"; - is_multithreaded |= thread > 0; + << time_in_seconds(timer.get_elapsed_time()) << " (" << rank << ", " << thread << ")\n"; + is_multithreaded |= thread > 0 || rank > 0; } // Write out timers accumulated over threads by name if (is_multithreaded) { // dedupe list entries from parallel execution std::map deduper; - for (const auto& [name, scope, timer, _] : timer_register) { + for (const auto& [name, scope, timer, thread, rank] : timer_register) { deduper[qualified_name(name, scope)] += timer.get_elapsed_time(); } out << "Unique Timers (accumulated): " << deduper.size() << "\n"; diff --git a/cpp/memilio/timer/named_timer.h b/cpp/memilio/timer/named_timer.h index 59f9aa4d56..b9a53743e4 100644 --- a/cpp/memilio/timer/named_timer.h +++ b/cpp/memilio/timer/named_timer.h @@ -77,7 +77,7 @@ class NamedTimer : public BasicTimer */ NamedTimer() { - TimerRegistrar::get_instance().add_timer({name(), scope(), *this, mio::omp::get_thread_id()}); + TimerRegistrar::get_instance().add_timer({name(), scope(), *this, mio::omp::get_thread_id(), 0}); } }; diff --git a/cpp/memilio/timer/registration.h b/cpp/memilio/timer/registration.h index 02bf4c39f7..26b42fdfda 100644 --- a/cpp/memilio/timer/registration.h +++ b/cpp/memilio/timer/registration.h @@ -20,6 +20,7 @@ #ifndef MIO_TIMER_REGISTRATION_H #define MIO_TIMER_REGISTRATION_H +#include "memilio/io/default_serialize.h" #include "memilio/timer/basic_timer.h" #include @@ -30,11 +31,13 @@ namespace mio namespace timing { +/// @brief Struct used to register ( @see TimerRegistrar ) and print timers ( @see Printer ). struct TimerRegistration { std::string name; std::string scope; BasicTimer& timer; int thread_id; + int rank; }; /** diff --git a/cpp/memilio/timer/table_printer.h b/cpp/memilio/timer/table_printer.h index a08d70af13..8cdca2714a 100644 --- a/cpp/memilio/timer/table_printer.h +++ b/cpp/memilio/timer/table_printer.h @@ -206,8 +206,8 @@ class TablePrinter : public Printer std::map row_to_index; // map from name to index, used to fill table bool is_multithreaded = false; // keep track of whether a thread id > 0 exists // map rows from thread 0 first, so the order of timers (mostly) corresponds to their call order - for (const auto& [name, scope, _, thread] : timer_register) { - if (thread == 0) { + for (const auto& [name, scope, timer, thread, rank] : timer_register) { + if (thread == 0 && rank == 0) { const std::string qn = qualified_name(name, scope); if (row_to_index.emplace(qn, rows.size()).second) { rows.push_back(qn); @@ -220,8 +220,8 @@ class TablePrinter : public Printer // make a second pass to add timers from other threads // this does nothing, if all timers are used on thread 0 at least once if (is_multithreaded) { - for (auto& [name, scope, _, thread] : timer_register) { - if (thread != 0) { + for (auto& [name, scope, timer, thread, rank] : timer_register) { + if (thread != 0 || rank != 0) { const std::string qn = qualified_name(name, scope); if (row_to_index.emplace(qn, rows.size()).second) { rows.push_back(qn); @@ -245,7 +245,7 @@ class TablePrinter : public Printer } // accumulate elapsed time and gather statistics in the table // averages are calculated later, using finished values from elapsed and num - for (auto& [name, scope, timer, thread] : timer_register) { + for (auto& [name, scope, timer, thread, rank] : timer_register) { const auto row = row_to_index[qualified_name(name, scope)]; const auto time = time_in_seconds(timer.get_elapsed_time()); table(row, elapsed) += time; diff --git a/cpp/memilio/timer/timer_registrar.h b/cpp/memilio/timer/timer_registrar.h index a1c831f645..f127c18802 100644 --- a/cpp/memilio/timer/timer_registrar.h +++ b/cpp/memilio/timer/timer_registrar.h @@ -20,14 +20,19 @@ #ifndef MIO_TIMER_TIMER_REGISTRAR_H #define MIO_TIMER_TIMER_REGISTRAR_H +#include "memilio/io/binary_serializer.h" +#include "memilio/io/default_serialize.h" +#include "memilio/timer/basic_timer.h" #include "memilio/timer/registration.h" #include "memilio/timer/table_printer.h" +#include "memilio/utils/compiler_diagnostics.h" +#include "memilio/utils/miompi.h" +#include #include #include #include #include -#include namespace mio { @@ -88,14 +93,35 @@ class TimerRegistrar { PRAGMA_OMP(single nowait) { - get_instance().m_printer->print(m_register, out); + m_printer->print(m_register, out); + } + } + + /** + * @brief Print all timers gathered on the given MPI Comm using a Printer. + * + * By default, uses TablePrinter to write to stdout. The printer can be changed using the set_printer member. + * + * @param comm An MPI communicator. Make sure this method is called by all ranks on comm! + * @param out Any output stream, defaults to std::cout. + */ + void print_gathered_timers(mpi::Comm comm, std::ostream& out = std::cout) + { + PRAGMA_OMP(single nowait) + { + std::list external_timers; // temporary storage for gathered registrations + std::list gathered_register; // combined registrations of all ranks + gather_timers(comm, external_timers, gathered_register); + if (mpi::is_root()) { + m_printer->print(gathered_register, out); + } } } /// @brief Prevent the TimerRegistrar from calling print_timers on exit from main. - void disable_final_timer_summary() const + void disable_final_timer_summary() { - get_instance().m_print_on_death = false; + m_print_on_death = false; } /** @@ -143,6 +169,62 @@ class TimerRegistrar } } + /// @brief Gather timers from all ranks, using external_timers as temporary timer storage for gathered_register. + void gather_timers(mpi::Comm comm, std::list& external_timers, + std::list& gathered_register) const + { +#ifndef MEMILIO_ENABLE_MPI + mio::unused(comm, external_timers); +#endif + if (mpi::is_root()) { + std::ranges::transform(m_register, std::back_inserter(gathered_register), [](const TimerRegistration& r) { + return TimerRegistration{r.name, r.scope, r.timer, r.thread_id, 0}; + }); + } + if (comm == nullptr) { + if (mpi::is_root()) { + log_error("Got nullptr as MPI Comm. Only timers on root rank are gathered."); + } + return; + } +#ifdef MEMILIO_ENABLE_MPI + // name, scope, timer, thread_id + using GatherableRegistration = std::tuple; + int comm_size; + MPI_Comm_size(comm, &comm_size); + + if (mpi::is_root()) { + for (int snd_rank = 1; snd_rank < comm_size; snd_rank++) { // skip root rank! + int bytes_size; + MPI_Recv(&bytes_size, 1, MPI_INT, snd_rank, 0, comm, MPI_STATUS_IGNORE); + ByteStream bytes(bytes_size); + MPI_Recv(bytes.data(), bytes.data_size(), MPI_BYTE, snd_rank, 0, mpi::get_world(), MPI_STATUS_IGNORE); + + auto rec_register = deserialize_binary(bytes, Tag>{}); + if (!rec_register) { + log_error("Error receiving ensemble results from rank {}.", snd_rank); + } + std::ranges::transform( + rec_register.value(), std::back_inserter(gathered_register), [&](const GatherableRegistration& r) { + const auto& [name, scope, timer, thread_id] = r; + external_timers.push_back(timer); + return TimerRegistration{name, scope, external_timers.back(), thread_id, snd_rank}; + }); + } + } + else { + std::vector snd_register; + std::ranges::transform(m_register, std::back_inserter(snd_register), [](const TimerRegistration& r) { + return GatherableRegistration{r.name, r.scope, r.timer, r.thread_id}; + }); + ByteStream bytes = serialize_binary(snd_register); + int bytes_size = int(bytes.data_size()); + MPI_Send(&bytes_size, 1, MPI_INT, 0, 0, comm); + MPI_Send(bytes.data(), bytes.data_size(), MPI_BYTE, 0, 0, comm); + } +#endif + } + std::unique_ptr m_printer = std::make_unique(); ///< A printer to visualize all timers. bool m_print_on_death = true; ///< Whether to call m_printer during the destructor. std::list m_register; ///< List that allows access to timers without having their name. diff --git a/cpp/memilio/utils/miompi.cpp b/cpp/memilio/utils/miompi.cpp index 8c1079d538..3959dbaad8 100644 --- a/cpp/memilio/utils/miompi.cpp +++ b/cpp/memilio/utils/miompi.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ #include "memilio/utils/miompi.h" +#include "memilio/utils/compiler_diagnostics.h" #include "memilio/utils/logging.h" #ifdef MEMILIO_ENABLE_MPI @@ -56,12 +57,31 @@ void finalize() bool is_root() { -#ifdef MEMILIO_ENABLE_MPI + return rank(get_world()) == 0; +} + +int rank(Comm comm) +{ +#ifndef MEMILIO_ENABLE_MPI + mio::unused(comm); + return 0; +#else int rank; - MPI_Comm_rank(get_world(), &rank); - return rank == 0; + MPI_Comm_rank(comm, &rank); + return rank; +#endif +} + +int size(Comm comm) +{ +#ifndef MEMILIO_ENABLE_MPI + mio::unused(comm); + return 1; +#else + int size; + MPI_Comm_size(comm, &size); + return size; #endif - return true; } } // namespace mpi diff --git a/cpp/memilio/utils/miompi.h b/cpp/memilio/utils/miompi.h index b3a255da46..517fe355a3 100644 --- a/cpp/memilio/utils/miompi.h +++ b/cpp/memilio/utils/miompi.h @@ -60,6 +60,18 @@ void finalize(); */ bool is_root(); +/** + * @brief Return the rank of the calling process on the given communicator. + * @param comm An MPI communicator. + */ +int rank(Comm comm); + +/** + * @brief Return the size of the given communicator. + * @param comm An MPI communicator. + */ +int size(Comm comm); + } // namespace mpi } // namespace mio diff --git a/cpp/tests/test_timers.cpp b/cpp/tests/test_timers.cpp index 96a258f7c3..f8a6856971 100644 --- a/cpp/tests/test_timers.cpp +++ b/cpp/tests/test_timers.cpp @@ -17,6 +17,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "memilio/utils/miompi.h" #include "utils.h" #include "memilio/timer/auto_timer.h" #include "memilio/timer/list_printer.h" @@ -42,11 +43,11 @@ struct TimingTest : public ::testing::Test { public: static std::list printable_timers() { - return {{"A", "A", m_timer, 0}, - {"B", "", m_timer, 0}, - {"B", "", m_timer, 1}, - {"C has a fairly long name", "C", m_timer, 0}, - {"D", "D", m_timer, 2}}; + return {{"A", "A", m_timer, 0, 0}, + {"B", "", m_timer, 0, 0}, + {"B", "", m_timer, 1, 0}, + {"C has a fairly long name", "C", m_timer, 0, 1}, + {"D", "D", m_timer, 2, 2}}; } // something to make sure the timers do not measure 0 @@ -152,7 +153,7 @@ TEST_F(TimingTest, TimerRegistrar) PRAGMA_OMP(parallel num_threads(mio::omp::get_max_threads())) { const auto i = mio::omp::get_thread_id(); - reg.add_timer({"name", "", bts[i], i}); // reusing names is bad. but the registrar does not care + reg.add_timer({"name", "", bts[i], i, 0}); // reusing names is bad. but the registrar does not care } // -> verify by register size const int new_num_timers = old_num_timers + mio::omp::get_max_threads(); @@ -173,10 +174,24 @@ TEST_F(TimingTest, TimerRegistrar) reg.set_printer(std::move(p)); // check print_timers by using the SizePrinter (defined above), which only writes reg.get_register().size() int size = -1; - std::stringstream out; - reg.print_timers(out); - out >> size; + { + std::stringstream out; + reg.print_timers(out); + out >> size; + } EXPECT_EQ(size, new_num_timers); + // check print_gathered_timers, expecting size * #timers (even when MPI is disabled; in that case, check for a log) + size = -1; + mio::RedirectLogger logger; + logger.capture(); + { + std::stringstream out; + reg.print_gathered_timers(mio::mpi::get_world(), out); + out >> size; + } + EXPECT_EQ(logger.view().empty(), (mio::mpi::get_world() != nullptr) && (mio::mpi::is_root())) << logger.read(); + logger.release(); + EXPECT_EQ(size, mio::mpi::size(mio::mpi::get_world()) * new_num_timers); } TEST_F(TimingTest, qualified_name) @@ -192,11 +207,11 @@ TEST_F(TimingTest, ListPrinter) // manually verify the new behaviour, and update the reference_output. const std::string reference_output = "All Timers: 5\n" - " A::A: 0.000000e+00 (0)\n" - " B: 0.000000e+00 (0)\n" - " B: 0.000000e+00 (1)\n" - " C::C has a fairly long name: 0.000000e+00 (0)\n" - " D::D: 0.000000e+00 (2)\n" + " A::A: 0.000000e+00 (0, 0)\n" + " B: 0.000000e+00 (0, 0)\n" + " B: 0.000000e+00 (0, 1)\n" + " C::C has a fairly long name: 0.000000e+00 (1, 0)\n" + " D::D: 0.000000e+00 (2, 2)\n" "Unique Timers (accumulated): 4\n" " A::A: 0.000000e+00\n" " B: 0.000000e+00\n" diff --git a/docs/source/cpp/performance_monitoring.rst b/docs/source/cpp/performance_monitoring.rst index 3138655be2..10fcae20de 100644 --- a/docs/source/cpp/performance_monitoring.rst +++ b/docs/source/cpp/performance_monitoring.rst @@ -68,12 +68,17 @@ To measure how long advancing a simulation without the setup takes, you can writ ... // evaluate results } -and will see a table printed at the end of your program next to the timer named "my simulation", that lists the time it took to :code:`advance`. +and will see a table printed at the end of your program next to the timer named "my simulation", that lists the time it +took to :code:`advance`. You can add more timers like this, but make sure you use unique names, otherwise the same timer will be reused, and the measured times will be added together. The name of the timer object itself (here :code:`my_timer`) is not important, as long as the compiler does not complain about it. +Note that the automatic print can be disabled, and a manual print can be performed instead. There is also a print method +that gathers timers from all ranks in an MPI parallel context. Check out :code:`mio::timing::TimerRegistrar` for details +on these methods. + Timing in the library ^^^^^^^^^^^^^^^^^^^^^