Our program crashed every single time it exited. No preamble; straight to the stack trace:

StackTrace

(rr) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007f4467c58859 in __GI_abort () at abort.c:79
#2  0x0000000006f3899c in ?? ()
#3  0x00000000043e19bb in nebula::meta::MetaClient::~MetaClient (this=0x7f445e233000)
    at /home/doodle.wang/Git/nebula-ent/src/clients/meta/MetaClient.cpp:98
#4  0x00000000043e25cb in nebula::meta::MetaClient::~MetaClient (this=0x7f445e233000)
    at /home/doodle.wang/Git/nebula-ent/src/clients/meta/MetaClient.cpp:94
#5  0x0000000003f9b39e in std::default_delete<nebula::meta::MetaClient>::operator() (this=0x756ba20 <metaClient>, __ptr=0x7f445e233000)
    at /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/unique_ptr.h:81
#6  0x0000000003f82174 in std::unique_ptr<nebula::meta::MetaClient, std::default_delete<nebula::meta::MetaClient> >::~unique_ptr (
    this=0x756ba20 <metaClient>) at /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/unique_ptr.h:292
#7  0x00007f4467c7c8a7 in __run_exit_handlers (status=1, listp=0x7f4467e22718 <__exit_funcs>, run_list_atexit=run_list_atexit@entry=true,
    run_dtors=run_dtors@entry=true) at exit.c:108
#8  0x00007f4467c7ca60 in __GI_exit (status=<optimized out>) at exit.c:139
#9  0x00007f4467c5a08a in __libc_start_main (main=0x3f3d700 <main(int, char**)>, argc=3, argv=0x7ffc957efe08, init=<optimized out>, fini=<optimized out>,
    rtld_fini=<optimized out>, stack_end=0x7ffc957efdf8) at ../csu/libc-start.c:342
#10 0x0000000003f3c02e in _start ()

The corresponding code is below. MetaClient is a global variable, and yet the crash happens in its destructor. Note that MetaClient.cpp:98 does nothing more than print a log line:

94 MetaClient::~MetaClient() {
95   notifyStop();
96   stop();
97   delete metadata_.load();
98   VLOG(3) << "~MetaClient";
99 }

It is this log line that ends up calling abort. Let's look at the corresponding assembly:

Dump of assembler code for function nebula::meta::MetaClient::~MetaClient():
   0x00000000043e1910 <+0>:	sub    $0x68,%rsp
   0x00000000043e1914 <+4>:	mov    %rdi,0x60(%rsp)
   0x00000000043e1919 <+9>:	mov    0x60(%rsp),%rax
   0x00000000043e191e <+14>:	movq   $0x3cd00f8,(%rax)
   0x00000000043e1925 <+21>:	mov    %rax,%rdi
   0x00000000043e1928 <+24>:	mov    %rax,0x28(%rsp)
   0x00000000043e192d <+29>:	callq  0x43e2200 <nebula::meta::MetaClient::notifyStop()>
   0x00000000043e1932 <+34>:	jmpq   0x43e1937 <nebula::meta::MetaClient::~MetaClient()+39>
   0x00000000043e1937 <+39>:	mov    0x28(%rsp),%rdi
   0x00000000043e193c <+44>:	callq  0x43e2270 <nebula::meta::MetaClient::stop()>
   0x00000000043e1941 <+49>:	jmpq   0x43e1946 <nebula::meta::MetaClient::~MetaClient()+54>
   0x00000000043e1946 <+54>:	mov    0x28(%rsp),%rax
   0x00000000043e194b <+59>:	add    $0x18e0,%rax
   0x00000000043e1951 <+65>:	mov    %rax,%rdi
   0x00000000043e1954 <+68>:	mov    $0x5,%esi
   0x00000000043e1959 <+73>:	callq  0x43e22f0 <std::atomic<nebula::meta::MetaClient::MetaData*>::load(std::memory_order) const>
   0x00000000043e195e <+78>:	cmp    $0x0,%rax
   0x00000000043e1962 <+82>:	mov    %rax,0x20(%rsp)
   0x00000000043e1967 <+87>:	je     0x43e1984 <nebula::meta::MetaClient::~MetaClient()+116>
   0x00000000043e196d <+93>:	mov    0x20(%rsp),%rdi
   0x00000000043e1972 <+98>:	callq  0x43e23d0 <nebula::meta::MetaClient::MetaData::~MetaData()>
   0x00000000043e1977 <+103>:	mov    0x20(%rsp),%rax
   0x00000000043e197c <+108>:	mov    %rax,%rdi
   0x00000000043e197f <+111>:	callq  0x5d419d0 <operator delete(void*)>
   0x00000000043e1984 <+116>:	movl   $0x3,0x50(%rsp)
   0x00000000043e198c <+124>:	mov    0x75703f0,%rax
   0x00000000043e1994 <+132>:	movb   $0x0,0x37(%rsp)
   0x00000000043e1999 <+137>:	cmp    $0x0,%rax
   0x00000000043e199d <+141>:	jne    0x43e19d1 <nebula::meta::MetaClient::~MetaClient()+193>
   0x00000000043e19a3 <+147>:	mov    0x50(%rsp),%ecx
   0x00000000043e19a7 <+151>:	mov    $0x75703f0,%edi
   0x00000000043e19ac <+156>:	mov    $0x764c008,%esi
   0x00000000043e19b1 <+161>:	mov    $0x3eaab65,%edx
=> 0x00000000043e19b6 <+166>:	callq  0x6f38420 <_ZN6google11InitVLOG3__EPPiS0_PKci>
   0x00000000043e19bb <+171>:	mov    %al,0x1f(%rsp)
   0x00000000043e19bf <+175>:	jmpq   0x43e19c4 <nebula::meta::MetaClient::~MetaClient()+180>
   ...

As the listing shows, the cached per-call-site verbosity flag (the value loaded from 0x75703f0) is still null, so this execution of the VLOG(3) statement calls into glog's google::InitVLOG3__ to initialize it, and that is where we ultimately crash. Its assembly is as follows:

Dump of assembler code for function _ZN6google11InitVLOG3__EPPiS0_PKci:
   0x0000000006f38420 <+0>:	push   %rbp
   0x0000000006f38421 <+1>:	mov    %rsp,%rbp
   0x0000000006f38424 <+4>:	push   %r15
   0x0000000006f38426 <+6>:	mov    %rdx,%r15
   0x0000000006f38429 <+9>:	push   %r14
   0x0000000006f3842b <+11>:	push   %r13
   0x0000000006f3842d <+13>:	push   %r12
   0x0000000006f3842f <+15>:	push   %rbx
   0x0000000006f38430 <+16>:	sub    $0x78,%rsp
   0x0000000006f38434 <+20>:	movzbl 0x713c2d(%rip),%eax        # 0x764c068
   0x0000000006f3843b <+27>:	mov    %rdi,-0x98(%rbp)
   0x0000000006f38442 <+34>:	mov    %rsi,-0x78(%rbp)
   0x0000000006f38446 <+38>:	mov    %ecx,-0x80(%rbp)
   0x0000000006f38449 <+41>:	test   %al,%al
=> 0x0000000006f3844b <+43>:	jne    0x6f38710 <_ZN6google11InitVLOG3__EPPiS0_PKci+752>
   0x0000000006f38451 <+49>:	movzbl 0x713bc0(%rip),%eax        # 0x764c018
   0x0000000006f38458 <+56>:	mov    0x713bc1(%rip),%rbx        # 0x764c020
   0x0000000006f3845f <+63>:	mov    %al,-0x79(%rbp)
   0x0000000006f38462 <+66>:	test   %al,%al
   0x0000000006f38464 <+68>:	je     0x6f38540 <_ZN6google11InitVLOG3__EPPiS0_PKci+288>

We can see that at 0x6f38434 it executes movzbl 0x713c2d(%rip),%eax, and then at 0x6f38449 it does test %al,%al. Since rax is not 0 at this point, ZF is 0 and the jne at 0x6f3844b is taken, jumping to 0x6f38710.

(rr) info registers rax
rax            0x1                 1

Let's continue and look at the code around 0x6f38710:

   0x0000000006f38710 <+752>:	lea    0x713929(%rip),%rdi        # 0x764c040
=> 0x0000000006f38717 <+759>:	callq  0x3f3b840 <pthread_mutex_lock$pltgot>
   0x0000000006f3871c <+764>:	test   %eax,%eax
   0x0000000006f3871e <+766>:	je     0x6f38451 <_ZN6google11InitVLOG3__EPPiS0_PKci+49>
   0x0000000006f38724 <+772>:	jmpq   0x6f38997

We can see that it calls pthread_mutex_lock and then checks whether the return value is 0. At this point rax holds 0x16:

(rr) info register rax
rax            0x16                22

The je condition is therefore not satisfied, and execution proceeds to jmpq 0x6f38997:

   0x0000000006f38710 <+752>:	lea    0x713929(%rip),%rdi        # 0x764c040
   0x0000000006f38717 <+759>:	callq  0x3f3b840 <pthread_mutex_lock$pltgot>
   0x0000000006f3871c <+764>:	test   %eax,%eax
   0x0000000006f3871e <+766>:	je     0x6f38451 <_ZN6google11InitVLOG3__EPPiS0_PKci+49>
=> 0x0000000006f38724 <+772>:	jmpq   0x6f38997

And at 0x6f38997 we abort:

(rr) ni
0x0000000006f38997 in ?? ()
1: x/i $pc
=> 0x6f38997:	callq  0x3f3b850 <abort$pltgot>

What happened?

From the assembly above we know that pthread_mutex_lock failed with a non-zero return value, after which the program aborted almost immediately. Following that trail, let's look at the relevant glog code:

static Mutex vmodule_lock;

// NOTE: Individual VLOG statements cache the integer log level pointers.
// NOTE: This function must not allocate memory or require any locks.
bool InitVLOG3__(int32** site_flag, int32* site_default,
                 const char* fname, int32 verbose_level) {
  MutexLock l(&vmodule_lock);
  // ...
}

vmodule_lock is a static global variable.

class MutexLock {
 public:
  explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
  ~MutexLock() { mu_->Unlock(); }
 private:
  Mutex * const mu_;
  // Disallow "evil" constructors
  MutexLock(const MutexLock&);
  void operator=(const MutexLock&);
};

Mutex::Lock ultimately just calls pthread_mutex_lock:

#define SAFE_PTHREAD(fncall)  do {   /* run fncall if is_safe_ is true */  \
  if (is_safe_ && fncall(&mutex_) != 0) abort();                           \
} while (0)

Mutex::Mutex()             {
  SetIsSafe();
  if (is_safe_ && pthread_mutex_init(&mutex_, NULL) != 0) abort();
}

Mutex::~Mutex()            { SAFE_PTHREAD(pthread_mutex_destroy); }
void Mutex::Lock()         { SAFE_PTHREAD(pthread_mutex_lock); }
void Mutex::Unlock()       { SAFE_PTHREAD(pthread_mutex_unlock); }
#ifdef GMUTEX_TRYLOCK
bool Mutex::TryLock()      { return is_safe_ ?
                                 pthread_mutex_trylock(&mutex_) == 0 : true; }
#endif
void Mutex::ReaderLock()   { Lock(); }
void Mutex::ReaderUnlock() { Unlock(); }
#undef SAFE_PTHREAD

We can see that is_safe_ is checked first, which corresponds to the instructions at 0x6f38434 and 0x6f38449 in the assembly above. Then pthread_mutex_lock is called, and if its return value is not 0, the code aborts outright:

#define SAFE_PTHREAD(fncall)  do {   /* run fncall if is_safe_ is true */  \
  if (is_safe_ && fncall(&mutex_) != 0) abort();                           \
} while (0)

To avoid drowning in details for now: the error code returned by pthread_mutex_lock is EINVAL, and the underlying reason is that the mutex had already been destroyed. As mentioned earlier, MetaClient is a global variable; if glog's vmodule_lock is destroyed before MetaClient is, then by the time ~MetaClient runs, the pthread_mutex_lock inside it is bound to fail.
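
Here is a minimal, self-contained sketch of that failure mode. Note that using a mutex after pthread_mutex_destroy is undefined behavior per POSIX; returning EINVAL is simply what glibc does in practice, which matches the 0x16 (22) seen in rax above:

// Sketch only: locking an already-destroyed mutex, the situation ~MetaClient
// runs into after glog's vmodule_lock has been torn down.  Undefined behavior
// per POSIX; on glibc the lock call typically fails with EINVAL.
#include <pthread.h>
#include <cerrno>
#include <cstdio>

int main() {
  pthread_mutex_t m;
  pthread_mutex_init(&m, nullptr);
  pthread_mutex_destroy(&m);          // vmodule_lock's destructor has already run

  int rc = pthread_mutex_lock(&m);    // what the VLOG(3) in ~MetaClient ends up doing
  std::printf("pthread_mutex_lock returned %d (EINVAL = %d)\n", rc, EINVAL);
  return 0;
}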

The root cause is that the destruction order of C++ global variables defined in different source files is unspecified. The fix is therefore simple: destroy the global MetaClient explicitly before main exits.
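
A minimal sketch of that fix, with a hypothetical Client standing in for nebula::meta::MetaClient; as the stack trace shows, the real global metaClient is held in a std::unique_ptr, so an explicit reset() before returning from main is enough:

// Sketch of the fix: tear the global down explicitly while glog's own globals
// (vmodule_lock etc.) are still alive, instead of relying on the unspecified
// cross-translation-unit destruction order at exit.
#include <cstdio>
#include <memory>

struct Client {                        // hypothetical stand-in for nebula::meta::MetaClient
  ~Client() { std::puts("~Client"); }  // the real destructor does VLOG(3) << "~MetaClient"
};

std::unique_ptr<Client> metaClient;    // global, like the real metaClient

int main() {
  metaClient = std::make_unique<Client>();
  // ... run the service ...
  metaClient.reset();                  // destroy it here, before exit-time destructors run
  return 0;
}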

Details in pthread_mutex_lock

Why does pthread_mutex_lock fail? Let's trace through it. The first thing hit is this pthread_mutex_lock wrapper, which forwards to __pthread_mutex_lock:

int pthread_mutex_lock(pthread_mutex_t* mutex) {
  fix_mutex_kind(mutex);
  if (!real_pthread_mutex_lock) {
#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
    return __pthread_mutex_lock(mutex);
#else
    real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
#endif
  }
  return real_pthread_mutex_lock(mutex);
}

__pthread_mutex_lock then dispatches to __pthread_mutex_lock_full:

int __pthread_mutex_lock (pthread_mutex_t *mutex)
{
  /* See concurrency notes regarding mutex type which is loaded from __kind
     in struct __pthread_mutex_s in sysdeps/nptl/bits/thread-shared-types.h.  */
  unsigned int type = PTHREAD_MUTEX_TYPE_ELISION (mutex);

  LIBC_PROBE (mutex_entry, 1, mutex);

  if (__builtin_expect (type & ~(PTHREAD_MUTEX_KIND_MASK_NP
				 | PTHREAD_MUTEX_ELISION_FLAGS_NP), 0))
    return __pthread_mutex_lock_full (mutex);

  // ...
}

Eventually we can see execution reach the following code, which returns EOWNERDEAD. This error code means that the previous owner of a resource (for example a mutex) terminated without properly releasing it; a small robust-mutex example after the listing illustrates the semantics.

      if (__glibc_unlikely (oldval & FUTEX_OWNER_DIED))
        {
          atomic_and (&mutex->__data.__lock, ~FUTEX_OWNER_DIED);

          /* We got the mutex.  */
          mutex->__data.__count = 1;
          /* But it is inconsistent unless marked otherwise.  */
          mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;

          /* We must not enqueue the mutex before we have acquired it.
             Also see comments at ENQUEUE_MUTEX.  */
          __asm ("" ::: "memory");
          ENQUEUE_MUTEX_PI (mutex);
          /* We need to clear op_pending after we enqueue the mutex.  */
          __asm ("" ::: "memory");
          THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);

          /* Note that we deliberately exit here.  If we fall
             through to the end of the function __nusers would be
             incremented which is not correct because the old owner
             has to be discounted.  If we are not supposed to
             increment __nusers we actually have to decrement it here.  */
# ifdef NO_INCR
          --mutex->__data.__nusers;
# endif

          return EOWNERDEAD;
        }
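
For context, here is a minimal, self-contained robust-mutex example (unrelated to the nebula code, build with -pthread) that shows the usual meaning of EOWNERDEAD: the next locker is told that the previous owner died while holding the mutex, and can repair the state with pthread_mutex_consistent:

#include <pthread.h>
#include <cerrno>
#include <cstdio>

static pthread_mutex_t mtx;

// The owning thread exits while still holding the mutex.
void* die_holding_lock(void*) {
  pthread_mutex_lock(&mtx);
  return nullptr;                          // never unlocks
}

int main() {
  pthread_mutexattr_t attr;
  pthread_mutexattr_init(&attr);
  pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
  pthread_mutex_init(&mtx, &attr);

  pthread_t t;
  pthread_create(&t, nullptr, die_holding_lock, nullptr);
  pthread_join(t, nullptr);

  int rc = pthread_mutex_lock(&mtx);       // returns EOWNERDEAD instead of deadlocking
  if (rc == EOWNERDEAD) {
    std::puts("previous owner died while holding the mutex");
    pthread_mutex_consistent(&mtx);        // mark the protected state usable again
  }
  pthread_mutex_unlock(&mtx);
  return 0;
}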
