-
Notifications
You must be signed in to change notification settings - Fork 8
Open
Description
I've been chasing down a weird crash and I think it's a race when a channel gets resized. The symptoms of the crash are an error message like this (sometimes other values) followed by termination:
Invalid buffer index for slot 0: 1
I've since managed to isolate it to channel resizing (I think). This is a test that reproduces the issue in about 1 in 10 runs (it varies: on one of my machines it's 1 in 2; on another it's closer to 1 in 20):
// Reproducer for the intermittent crash described in this issue: one thread
// publishes messages of doubling size on a channel while a second thread
// concurrently reads from the same channel through a separate client.
TEST_F(ClientTest, TestCrash) {
std::string channel_name = "CrashChannel";
// Each thread uses its own client; both are initialized against Socket().
subspace::Client client1;
subspace::Client client2;
ASSERT_TRUE(client1.Init(Socket()).ok());
ASSERT_TRUE(client2.Init(Socket()).ok());
// Set by the publisher thread once its loop is done; polled by the subscriber.
std::atomic<bool> publisher_finished = false;
// Publisher thread.
auto t1 = std::thread([&]() {
// NOTE(review): the arguments 1 and 4 are presumably the initial slot size
// and the number of slots -- confirm against the Client API.
// NOTE(review): the result is dereferenced without checking for an error.
auto client1_pub = *client1.CreatePublisher(channel_name, 1, 4);
// Request a buffer of 2^i bytes for i = 1..31, so every request is larger
// than the previous one. Per the report, this growth is suspected to force
// a channel resize on each iteration -- not verifiable from this snippet.
for (int i = 1; i < 32; i++) {
// std::pow returns a floating-point value that is converted to std::size_t.
std::size_t size = std::pow(2, i);
auto buffer = client1_pub.GetMessageBuffer(size);
// Fill the whole buffer with the byte value i.
// NOTE(review): buffer is dereferenced without checking for an error.
std::memset(*buffer, i, size);
// The result of PublishMessage is ignored.
client1_pub.PublishMessage(size);
}
publisher_finished = true;
});
// Subscriber thread.
auto t2 = std::thread([&]() {
// NOTE(review): the result is dereferenced without checking for an error.
auto client2_sub = *client2.CreateSubscriber(channel_name);
// Read in a tight loop until the publisher signals completion.
while (publisher_finished == false) {
// NOTE(review): the result is dereferenced without checking for an error.
auto message = *client2_sub.ReadMessage();
size_t size = message.length;
// A zero length is skipped (presumably "no message available" -- confirm);
// otherwise the message length is printed.
if (size == 0) {
continue;
} else {
std::cout << size << std::endl;
}
}
});
// Wait for both threads before the test body returns.
t1.join();
t2.join();
}
- Is this test violating any locking/ordering assumptions?
- If not, do you have any suggestions on where I can focus my investigation?
This is the output and the stack trace from one machine (macos-aarch64, compiled with clang, running under lldb; this machine crashes about 1 in every 20 runs):
[ RUN ] ClientTest.TestCrash
2
4096
8192
16384
32768
Process 49232 stopped
* thread #4, stop reason = EXC_BAD_ACCESS (code=1, address=0x60)
frame #0: 0x00000001000c8c70 client_test`subspace::Channel::NextSlot(subspace::MessageSlot*, bool, int, std::__1::function<bool (subspace::ChannelLock*)>) + 536
client_test`subspace::Channel::NextSlot:
-> 0x1000c8c70 <+536>: ldr x8, [x9, #0x18]
0x1000c8c74 <+540>: orr x8, x8, #0x4
0x1000c8c78 <+544>: str x8, [x9, #0x18]
0x1000c8c7c <+548>: ldr x8, [sp, #0x68]
Target 0: (client_test) stopped.
(lldb) bt
* thread #4, stop reason = EXC_BAD_ACCESS (code=1, address=0x60)
* frame #0: 0x00000001000c8c70 client_test`subspace::Channel::NextSlot(subspace::MessageSlot*, bool, int, std::__1::function<bool (subspace::ChannelLock*)>) + 536
frame #1: 0x00000001000a8d4c client_test`subspace::details::SubscriberImpl::NextSlot(std::__1::function<bool (subspace::ChannelLock*)>) + 132
frame #2: 0x00000001000a897c client_test`subspace::Client::ReadMessageInternal(subspace::details::SubscriberImpl*, subspace::ReadMode, bool, bool) + 332
frame #3: 0x00000001000a92a8 client_test`subspace::Client::ReadMessage(subspace::details::SubscriberImpl*, subspace::ReadMode) + 408
frame #4: 0x0000000100011d54 client_test`subspace::Subscriber::ReadMessage(subspace::ReadMode) + 48
frame #5: 0x000000010006237c client_test`ClientTest_TestCrash_Test::TestBody()::$_7::operator()() const + 192
frame #6: 0x0000000100062288 client_test`decltype(std::declval<ClientTest_TestCrash_Test::TestBody()::$_7>()()) std::__1::__invoke[abi:v160006]<ClientTest_TestCrash_Test::TestBody()::$_7>(ClientTest_TestCrash_Test::TestBody()::$_7&&) + 24
frame #7: 0x0000000100062264 client_test`void std::__1::__thread_execute[abi:v160006]<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, ClientTest_TestCrash_Test::TestBody()::$_7>(std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, ClientTest_TestCrash_Test::TestBody()::$_7>&, std::__1::__tuple_indices<>) + 28
frame #8: 0x0000000100061f6c client_test`void* std::__1::__thread_proxy[abi:v160006]<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, ClientTest_TestCrash_Test::TestBody()::$_7>>(void*) + 84
frame #9: 0x000000018f32d034 libsystem_pthread.dylib`_pthread_start + 136
On a different machine (linux-aarch64, compiled with gcc, running under gdb; this machine triggers the crash every other run):
[ RUN ] ClientTest.TestCrash
[New Thread 0xfffff4f0d040 (LWP 84416)]
[New Thread 0xffffeffff040 (LWP 84417)]
Invalid buffer index for slot 0: 1
Thread 4 "client_test" received signal SIGABRT, Aborted.
[Switching to Thread 0xffffeffff040 (LWP 84417)]
__pthread_kill_implementation (threadid=281474708271168, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
44 ./nptl/pthread_kill.c: No such file or directory.
(gdb) bt
#0 __pthread_kill_implementation (threadid=281474708271168, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
#1 0x0000fffff5c0f254 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
#2 0x0000fffff5bca67c in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#3 0x0000fffff5bb7130 in __GI_abort () at ./stdlib/abort.c:79
#4 0x0000fffff7ea1980 in subspace::Channel::Buffer(int) const ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libclient_Slibsubspace_Uclient.so
#5 0x0000fffff7ea1768 in subspace::Channel::Prefix(subspace::MessageSlot*) const ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libclient_Slibsubspace_Uclient.so
#6 0x0000fffff7e142f4 in subspace::Channel::NextSlot(subspace::MessageSlot*, bool, int, std::function<bool (subspace::ChannelLock*)>) ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libcommon_Slibsubspace_Ucommon.so
#7 0x0000fffff7ea40d0 in subspace::details::SubscriberImpl::NextSlot(std::function<bool (subspace::ChannelLock*)>) ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libclient_Slibsubspace_Uclient.so
#8 0x0000fffff7e9b5a0 in subspace::Client::ReadMessageInternal(subspace::details::SubscriberImpl*, subspace::ReadMode, bool, bool) ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libclient_Slibsubspace_Uclient.so
#9 0x0000fffff7e9b988 in subspace::Client::ReadMessage(subspace::details::SubscriberImpl*, subspace::ReadMode) ()
from /home/eq/.cache/bazel/_bazel_eq/48758b0bb01c6b4a67c45fe1327b09b0/execroot/_main/bazel-out/aarch64-fastbuild/bin/client/../_solib_aarch64/libclient_Slibsubspace_Uclient.so
#10 0x0000aaaaaaae9248 in subspace::Subscriber::ReadMessage(subspace::ReadMode) ()
#11 0x0000aaaaaaae0ae0 in ClientTest_TestCrash_Test::TestBody()::{lambda()#2}::operator()() const ()
#12 0x0000aaaaaaae70b4 in void std::__invoke_impl<void, ClientTest_TestCrash_Test::TestBody()::{lambda()#2}>(std::__invoke_other, ClientTest_TestCrash_Test::TestBody()::{lambda()#2}&&) ()
#13 0x0000aaaaaaae7004 in std::__invoke_result<ClientTest_TestCrash_Test::TestBody()::{lambda()#2}>::type std::__invoke<ClientTest_TestCrash_Test::TestBody()::{lambda()#2}>(ClientTest_TestCrash_Test::TestBody()::{lambda()#2}&&) ()
#14 0x0000aaaaaaae6f74 in void std::thread::_Invoker<std::tuple<ClientTest_TestCrash_Test::TestBody()::{lambda()#2}> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) ()
#15 0x0000aaaaaaae6f28 in std::thread::_Invoker<std::tuple<ClientTest_TestCrash_Test::TestBody()::{lambda()#2}> >::operator()() ()
#16 0x0000aaaaaaae6ee4 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<ClientTest_TestCrash_Test::TestBody()::{lambda()#2}> > >::_M_run() ()
#17 0x0000fffff5e431fc in ?? () from /lib/aarch64-linux-gnu/libstdc++.so.6
#18 0x0000fffff5c0d5c8 in start_thread (arg=0x0) at ./nptl/pthread_create.c:442
#19 0x0000fffff5c75d9c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Metadata
Metadata
Assignees
Labels
No labels