A crash caused by a single log line
Our program crashed every single time it exited. No preamble; here is the stack trace:
StackTrace
(rr) bt
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007f4467c58859 in __GI_abort () at abort.c:79
#2 0x0000000006f3899c in ?? ()
#3 0x00000000043e19bb in nebula::meta::MetaClient::~MetaClient (this=0x7f445e233000)
at /home/doodle.wang/Git/nebula-ent/src/clients/meta/MetaClient.cpp:98
#4 0x00000000043e25cb in nebula::meta::MetaClient::~MetaClient (this=0x7f445e233000)
at /home/doodle.wang/Git/nebula-ent/src/clients/meta/MetaClient.cpp:94
#5 0x0000000003f9b39e in std::default_delete<nebula::meta::MetaClient>::operator() (this=0x756ba20 <metaClient>, __ptr=0x7f445e233000)
at /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/unique_ptr.h:81
#6 0x0000000003f82174 in std::unique_ptr<nebula::meta::MetaClient, std::default_delete<nebula::meta::MetaClient> >::~unique_ptr (
this=0x756ba20 <metaClient>) at /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/unique_ptr.h:292
#7 0x00007f4467c7c8a7 in __run_exit_handlers (status=1, listp=0x7f4467e22718 <__exit_funcs>, run_list_atexit=run_list_atexit@entry=true,
run_dtors=run_dtors@entry=true) at exit.c:108
#8 0x00007f4467c7ca60 in __GI_exit (status=<optimized out>) at exit.c:139
#9 0x00007f4467c5a08a in __libc_start_main (main=0x3f3d700 <main(int, char**)>, argc=3, argv=0x7ffc957efe08, init=<optimized out>, fini=<optimized out>,
rtld_fini=<optimized out>, stack_end=0x7ffc957efdf8) at ../csu/libc-start.c:342
#10 0x0000000003f3c02e in _start ()
The corresponding code is shown below. MetaClient is a global variable, yet the crash happens in its destructor, and MetaClient.cpp:98 does nothing more than print a log message:
94 MetaClient::~MetaClient() {
95 notifyStop();
96 stop();
97 delete metadata_.load();
98 VLOG(3) << "~MetaClient";
99 }
And that log line is what ends up calling abort. Let's look at the corresponding assembly:
Dump of assembler code for function nebula::meta::MetaClient::~MetaClient():
0x00000000043e1910 <+0>: sub $0x68,%rsp
0x00000000043e1914 <+4>: mov %rdi,0x60(%rsp)
0x00000000043e1919 <+9>: mov 0x60(%rsp),%rax
0x00000000043e191e <+14>: movq $0x3cd00f8,(%rax)
0x00000000043e1925 <+21>: mov %rax,%rdi
0x00000000043e1928 <+24>: mov %rax,0x28(%rsp)
0x00000000043e192d <+29>: callq 0x43e2200 <nebula::meta::MetaClient::notifyStop()>
0x00000000043e1932 <+34>: jmpq 0x43e1937 <nebula::meta::MetaClient::~MetaClient()+39>
0x00000000043e1937 <+39>: mov 0x28(%rsp),%rdi
0x00000000043e193c <+44>: callq 0x43e2270 <nebula::meta::MetaClient::stop()>
0x00000000043e1941 <+49>: jmpq 0x43e1946 <nebula::meta::MetaClient::~MetaClient()+54>
0x00000000043e1946 <+54>: mov 0x28(%rsp),%rax
0x00000000043e194b <+59>: add $0x18e0,%rax
0x00000000043e1951 <+65>: mov %rax,%rdi
0x00000000043e1954 <+68>: mov $0x5,%esi
0x00000000043e1959 <+73>: callq 0x43e22f0 <std::atomic<nebula::meta::MetaClient::MetaData*>::load(std::memory_order) const>
0x00000000043e195e <+78>: cmp $0x0,%rax
0x00000000043e1962 <+82>: mov %rax,0x20(%rsp)
0x00000000043e1967 <+87>: je 0x43e1984 <nebula::meta::MetaClient::~MetaClient()+116>
0x00000000043e196d <+93>: mov 0x20(%rsp),%rdi
0x00000000043e1972 <+98>: callq 0x43e23d0 <nebula::meta::MetaClient::MetaData::~MetaData()>
0x00000000043e1977 <+103>: mov 0x20(%rsp),%rax
0x00000000043e197c <+108>: mov %rax,%rdi
0x00000000043e197f <+111>: callq 0x5d419d0 <operator delete(void*)>
0x00000000043e1984 <+116>: movl $0x3,0x50(%rsp)
0x00000000043e198c <+124>: mov 0x75703f0,%rax
0x00000000043e1994 <+132>: movb $0x0,0x37(%rsp)
0x00000000043e1999 <+137>: cmp $0x0,%rax
0x00000000043e199d <+141>: jne 0x43e19d1 <nebula::meta::MetaClient::~MetaClient()+193>
0x00000000043e19a3 <+147>: mov 0x50(%rsp),%ecx
0x00000000043e19a7 <+151>: mov $0x75703f0,%edi
0x00000000043e19ac <+156>: mov $0x764c008,%esi
0x00000000043e19b1 <+161>: mov $0x3eaab65,%edx
=> 0x00000000043e19b6 <+166>: callq 0x6f38420 <_ZN6google11InitVLOG3__EPPiS0_PKci>
0x00000000043e19bb <+171>: mov %al,0x1f(%rsp)
0x00000000043e19bf <+175>: jmpq 0x43e19c4 <nebula::meta::MetaClient::~MetaClient()+180>
...
So the crash ultimately happens inside this glog function, google::InitVLOG3__, whose assembly is as follows:
Dump of assembler code for function _ZN6google11InitVLOG3__EPPiS0_PKci:
0x0000000006f38420 <+0>: push %rbp
0x0000000006f38421 <+1>: mov %rsp,%rbp
0x0000000006f38424 <+4>: push %r15
0x0000000006f38426 <+6>: mov %rdx,%r15
0x0000000006f38429 <+9>: push %r14
0x0000000006f3842b <+11>: push %r13
0x0000000006f3842d <+13>: push %r12
0x0000000006f3842f <+15>: push %rbx
0x0000000006f38430 <+16>: sub $0x78,%rsp
0x0000000006f38434 <+20>: movzbl 0x713c2d(%rip),%eax # 0x764c068
0x0000000006f3843b <+27>: mov %rdi,-0x98(%rbp)
0x0000000006f38442 <+34>: mov %rsi,-0x78(%rbp)
0x0000000006f38446 <+38>: mov %ecx,-0x80(%rbp)
0x0000000006f38449 <+41>: test %al,%al
=> 0x0000000006f3844b <+43>: jne 0x6f38710 <_ZN6google11InitVLOG3__EPPiS0_PKci+752>
0x0000000006f38451 <+49>: movzbl 0x713bc0(%rip),%eax # 0x764c018
0x0000000006f38458 <+56>: mov 0x713bc1(%rip),%rbx # 0x764c020
0x0000000006f3845f <+63>: mov %al,-0x79(%rbp)
0x0000000006f38462 <+66>: test %al,%al
0x0000000006f38464 <+68>: je 0x6f38540 <_ZN6google11InitVLOG3__EPPiS0_PKci+288>
At 0x6f38434 the code executes movzbl 0x713c2d(%rip),%eax, and at 0x6f38449 it runs test %al,%al. Since rax is not 0 at this point (so ZF is 0), the jne at 0x6f3844b jumps to 0x6f38710:
(rr) info registers rax
rax 0x1 1
Let's continue and look at the code around 0x6f38710:
0x0000000006f38710 <+752>: lea 0x713929(%rip),%rdi # 0x764c040
=> 0x0000000006f38717 <+759>: callq 0x3f3b840 <pthread_mutex_lock$pltgot>
0x0000000006f3871c <+764>: test %eax,%eax
0x0000000006f3871e <+766>: je 0x6f38451 <_ZN6google11InitVLOG3__EPPiS0_PKci+49>
0x0000000006f38724 <+772>: jmpq 0x6f38997
It calls pthread_mutex_lock and then checks whether the return value is 0. Here the register holds 0x16:
(rr) info register rax
rax 0x16 22
The je condition is therefore not satisfied, so execution continues with jmpq 0x6f38997:
0x0000000006f38710 <+752>: lea 0x713929(%rip),%rdi # 0x764c040
0x0000000006f38717 <+759>: callq 0x3f3b840 <pthread_mutex_lock$pltgot>
0x0000000006f3871c <+764>: test %eax,%eax
0x0000000006f3871e <+766>: je 0x6f38451 <_ZN6google11InitVLOG3__EPPiS0_PKci+49>
=> 0x0000000006f38724 <+772>: jmpq 0x6f38997
And at 0x6f38997, we abort:
(rr) ni
0x0000000006f38997 in ?? ()
1: x/i $pc
=> 0x6f38997: callq 0x3f3b850 <abort$pltgot>
What happened?
From the assembly above we know that pthread_mutex_lock failed with a non-zero return value, and the program aborted shortly afterwards. Following that trail, let's look at the relevant glog code:
static Mutex vmodule_lock;
// NOTE: Individual VLOG statements cache the integer log level pointers.
// NOTE: This function must not allocate memory or require any locks.
bool InitVLOG3__(int32** site_flag, int32* site_default,
const char* fname, int32 verbose_level) {
MutexLock l(&vmodule_lock);
// ...
}
vmodule_lock is a static global variable, and MutexLock is a simple RAII scoped-lock class:
class MutexLock {
public:
explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
~MutexLock() { mu_->Unlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
MutexLock(const MutexLock&);
void operator=(const MutexLock&);
};
Mutex::Lock in turn just calls pthread_mutex_lock:
#define SAFE_PTHREAD(fncall) do { /* run fncall if is_safe_ is true */ \
if (is_safe_ && fncall(&mutex_) != 0) abort(); \
} while (0)
Mutex::Mutex() {
SetIsSafe();
if (is_safe_ && pthread_mutex_init(&mutex_, NULL) != 0) abort();
}
Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock); }
#ifdef GMUTEX_TRYLOCK
bool Mutex::TryLock() { return is_safe_ ?
pthread_mutex_trylock(&mutex_) == 0 : true; }
#endif
void Mutex::ReaderLock() { Lock(); }
void Mutex::ReaderUnlock() { Unlock(); }
#undef SAFE_PTHREAD
We can see that is_safe_ is checked first, corresponding to the instructions at 0x6f38434 and 0x6f38449 in the assembly above. Then pthread_mutex_lock is called, and if its return value is non-zero, the process simply aborts:
#define SAFE_PTHREAD(fncall) do { /* run fncall if is_safe_ is true */ \
if (is_safe_ && fncall(&mutex_) != 0) abort(); \
} while (0)
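For reference, the 0x16 we saw in rax decodes to errno 22, i.e. EINVAL. A one-line check with strerror confirms the mapping:

#include <cerrno>
#include <cstdio>
#include <cstring>

int main() {
  // EINVAL is 22 (0x16) on Linux; strerror prints "Invalid argument"
  std::printf("%d: %s\n", EINVAL, std::strerror(EINVAL));
  return 0;
}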
To avoid dwelling on the details up front: as just decoded, the error code from pthread_mutex_lock is EINVAL, and the underlying reason is that the lock had already been destroyed. As mentioned earlier, MetaClient is a global variable; if the vmodule_lock mutex is destroyed before MetaClient is, then the pthread_mutex_lock call during MetaClient's destruction is bound to fail.
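To make the failure mode concrete, here is a minimal, self-contained sketch. The Logger and Client types are hypothetical stand-ins for glog's vmodule_lock machinery and MetaClient; within a single translation unit globals are destroyed in reverse declaration order, so the bad ordering can be forced deterministically:

#include <cstdio>
#include <cstdlib>
#include <pthread.h>

struct Logger {                       // stands in for glog's vmodule_lock
  pthread_mutex_t mu;
  Logger() { pthread_mutex_init(&mu, nullptr); }
  ~Logger() { pthread_mutex_destroy(&mu); }
  void log(const char* msg) {
    // glog's SAFE_PTHREAD behavior: abort if the lock call returns non-zero
    if (pthread_mutex_lock(&mu) != 0) std::abort();
    std::puts(msg);
    pthread_mutex_unlock(&mu);
  }
};

struct Client {                       // stands in for MetaClient
  ~Client();
};

Client client;  // constructed first  => destroyed last
Logger logger;  // constructed second => destroyed first

Client::~Client() {
  logger.log("~Client");  // runs after ~Logger: locking a destroyed mutex
}                         // is undefined; on glibc it typically fails with EINVAL

int main() { return 0; }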
The essence of the problem is that the destruction order of C++ global variables defined in different source files is unspecified. The fix is therefore simple: destroy the MetaClient global explicitly before main returns.
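A sketch of the fix, assuming the global lives in a std::unique_ptr named metaClient as frames #5 and #6 of the stack trace suggest (the MetaClient type here is a stand-in):

#include <memory>

struct MetaClient {   // stand-in for nebula::meta::MetaClient
  ~MetaClient() { /* notifyStop(); stop(); VLOG(3) << "~MetaClient"; */ }
};

std::unique_ptr<MetaClient> metaClient;  // the global from the stack trace

int main() {
  metaClient = std::make_unique<MetaClient>();
  // ... normal startup and work ...
  // Destroy the client explicitly before main returns, while glog's
  // static vmodule_lock is guaranteed to still be alive.
  metaClient.reset();
  return 0;
}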
Details in pthread_mutex_lock
Why does pthread_mutex_lock fail? Let's trace it a bit. Since we are replaying under rr, the call first goes through rr's interposed pthread_mutex_lock, which forwards to glibc's __pthread_mutex_lock:
int pthread_mutex_lock(pthread_mutex_t* mutex) {
  fix_mutex_kind(mutex);
  if (!real_pthread_mutex_lock) {
#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
    return __pthread_mutex_lock(mutex);
#else
    real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
#endif
  }
  return real_pthread_mutex_lock(mutex);
}
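As an aside, the dlsym(RTLD_NEXT, ...) interposition rr uses here is a handy debugging trick on its own. A hypothetical LD_PRELOAD shim (not part of rr or glog) that reports failing lock calls might look like this:

// Build: g++ -shared -fPIC shim.cpp -o shim.so -ldl
// Run:   LD_PRELOAD=./shim.so ./your_program
#include <dlfcn.h>
#include <pthread.h>
#include <cstdio>

extern "C" int pthread_mutex_lock(pthread_mutex_t* mutex) {
  using lock_fn = int (*)(pthread_mutex_t*);
  // Resolve the real symbol once; RTLD_NEXT skips this shim itself.
  static lock_fn real =
      reinterpret_cast<lock_fn>(dlsym(RTLD_NEXT, "pthread_mutex_lock"));
  int rc = real(mutex);
  if (rc != 0)
    std::fprintf(stderr, "pthread_mutex_lock(%p) -> %d\n",
                 static_cast<void*>(mutex), rc);
  return rc;
}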
__pthread_mutex_lock then dispatches to __pthread_mutex_lock_full:
int __pthread_mutex_lock (pthread_mutex_t *mutex)
{
/* See concurrency notes regarding mutex type which is loaded from __kind
in struct __pthread_mutex_s in sysdeps/nptl/bits/thread-shared-types.h. */
unsigned int type = PTHREAD_MUTEX_TYPE_ELISION (mutex);
LIBC_PROBE (mutex_entry, 1, mutex);
if (__builtin_expect (type & ~(PTHREAD_MUTEX_KIND_MASK_NP
| PTHREAD_MUTEX_ELISION_FLAGS_NP), 0))
return __pthread_mutex_lock_full (mutex);
// ...
}
Eventually we reach the code below, which returns EOWNERDEAD. This error code means that the previous owner of a resource (here, the mutex) terminated without properly releasing it.
  if (__glibc_unlikely (oldval & FUTEX_OWNER_DIED))
    {
      atomic_and (&mutex->__data.__lock, ~FUTEX_OWNER_DIED);

      /* We got the mutex.  */
      mutex->__data.__count = 1;
      /* But it is inconsistent unless marked otherwise.  */
      mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;

      /* We must not enqueue the mutex before we have acquired it.
         Also see comments at ENQUEUE_MUTEX.  */
      __asm ("" ::: "memory");
      ENQUEUE_MUTEX_PI (mutex);
      /* We need to clear op_pending after we enqueue the mutex.  */
      __asm ("" ::: "memory");
      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);

      /* Note that we deliberately exit here.  If we fall
         through to the end of the function __nusers would be
         incremented which is not correct because the old owner
         has to be discounted.  If we are not supposed to
         increment __nusers we actually have to decrement it here.  */
# ifdef NO_INCR
      --mutex->__data.__nusers;
# endif

      return EOWNERDEAD;
    }
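To round this off: EOWNERDEAD normally arises with robust mutexes, where the owning thread dies while holding the lock. A small sketch of that intended usage (independent of the crash above, purely to illustrate the error code):

#include <cerrno>
#include <cstdio>
#include <cstring>
#include <pthread.h>

static pthread_mutex_t mu;

void* die_while_holding(void*) {
  pthread_mutex_lock(&mu);
  return nullptr;  // thread exits without unlocking
}

int main() {
  pthread_mutexattr_t attr;
  pthread_mutexattr_init(&attr);
  pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
  pthread_mutex_init(&mu, &attr);

  pthread_t t;
  pthread_create(&t, nullptr, die_while_holding, nullptr);
  pthread_join(t, nullptr);

  int rc = pthread_mutex_lock(&mu);  // expected: EOWNERDEAD (130 on Linux)
  std::printf("lock -> %d (%s)\n", rc, std::strerror(rc));
  if (rc == EOWNERDEAD)
    pthread_mutex_consistent(&mu);   // we now own it; mark the state consistent
  pthread_mutex_unlock(&mu);
  return 0;
}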