Android是基于Linux存在的,常常有使用c/c++代码库编写native进程或者动态库的情况,这些库在运行时发生的异常统称Native Exception。

0 NE简介

通常native代码,出现错误,比如空指针异常,出现signal 11的信号错误,会打印如下的堆栈信息

 1--------- beginning of crash
 208-04 17:02:06.305   361   361 F libc    : Fatal signal 11 (SIGSEGV), code 1, fault addr 0xa in tid 361 (demo), pid 361 (demo)
 308-04 17:02:06.305   361   361 F libc    : ->>>after clone|pseudothread_tid(-1)
 408-04 17:02:06.306   361   361 F libc    : ->>>debuggerd_dispatch_pseudothread_1|pseudothread_tid(1668)
 508-04 17:02:06.312  1669  1669 F libc    : ->>>main_tid(361),pseudothread_tid(1668),debuggerd_dump_type(1)
 608-04 17:02:06.360   361   361 F libc    : ->>>1-1buf[0][1][2][3](0x1 0x0 0x0 0x0),rc(1),pseudothread_tid(1668)
 708-04 17:02:06.360   361   361 F libc    : ->>>debuggerd_dispatch_pseudothread_1-2|pseudothread_tid(1668)
 808-04 17:02:06.360   361   361 F libc    : ->>>debuggerd_dispatch_pseudothread_2|pseudothread_tid(1668)
 908-04 17:02:06.361   361   361 F libc    : ->>>debuggerd_dispatch_pseudothread_3|pseudothread_tid(1668)
1008-04 17:02:06.361   361   361 F libc    : ->>>after futex_wait|pseudothread_tid(0)
1108-04 17:02:06.366  1671  1671 F DEBUG   : *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
1208-04 17:02:06.366  1671  1671 F DEBUG   : Build fingerprint: 'xxx'
1308-04 17:02:06.366  1671  1671 F DEBUG   : Revision: '0'
1408-04 17:02:06.366  1671  1671 F DEBUG   : ABI: 'arm'
1508-04 17:02:06.366  1671  1671 F DEBUG   : pid: 361, tid: 361, name: demo  >>> /system/bin/demo <<<
1608-04 17:02:06.366  1671  1671 F DEBUG   : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0xa
1708-04 17:02:06.366  1671  1671 F DEBUG   : Cause: null pointer dereference
1808-04 17:02:06.366  1671  1671 F DEBUG   :     r0 0000000a  r1 00000061  r2 0538ad06  r3 fff162b0
1908-04 17:02:06.366  1671  1671 F DEBUG   :     r4 b91d04ed  r5 b91dde08  r6 00000004  r7 fff1639c
2008-04 17:02:06.366  1671  1671 F DEBUG   :     r8 00000000  r9 00000000  sl 00000000  fp fff1638c
2108-04 17:02:06.367  1671  1671 F DEBUG   :     ip fff1639c  sp fff16308  lr eb97f0eb  pc b91d0342  cpsr 20010030
2208-04 17:02:06.373  1671  1671 F DEBUG   :
2308-04 17:02:06.373  1671  1671 F DEBUG   : backtrace:
2408-04 17:02:06.373  1671  1671 F DEBUG   :     #00 pc 00006342  /system/bin/demo (main+489)
2508-04 17:02:06.373  1671  1671 F DEBUG   :     #01 pc 0007669d  /system/lib/libc.so (__libc_init+48)
2608-04 17:02:06.373  1671  1671 F DEBUG   :     #02 pc 000024d8  /system/bin/demo (_start_main+88)

1 NE流程分析

1.1从架构begin.S开始

1//bionic/linker/arch/arm/begin.S
2ENTRY(_start)
3  // Force unwinds to end in this function.
4  .cfi_undefined r14
5  mov r0, sp
6  //汇编语言,开始调用函数__linker_init
7  bl __linker_init
8  bx r0
9END(_start)

sp寄存器为堆栈指针寄存器,为栈顶指针,指向栈顶地址

r0寄存器用作传入函数参数,传出函数返回值,被调用函数在返回之前不必恢复r0寄存器

bx(Branch and Exchange)是ARM的跳转指令,而不是寄存器:它跳转到指定寄存器(这里是r0)中保存的地址,并根据该地址的最低位在ARM/Thumb指令集状态之间切换;bx本身不会把返回地址保存到lr寄存器

bl(Branch with Link)调用子程序,并把返回地址保存到lr寄存器,其中bl跳转范围为(-32MB~32MB)

1.将栈顶指针sp的值(即栈顶地址)赋值给r0寄存器(32位),作为参数传给__linker_init

1mov r0, sp

2.调用子程序__linker_init,把返回地址保存在lr寄存器里面

1bl __linker_init

3.__linker_init返回后,其返回值(可执行程序的开始地址)保存在r0中,通过bx跳转到该地址继续执行

1bx r0

1.2__linker_init

__linker_init()方法中调用了__linker_init_post_relocation(),__linker_init_post_relocation()初始化linker的全局变量,之后调用linker_main()函数获取可执行程序的开始地址,然后跳转到开始地址继续执行。

可以看到linker_main()函数中对系统环境进行了确认,对系统属性进行了初始化设置,之后调用linker_debuggerd_init()注册debuggerd的信号处理函数

 1//bionic/linker/linker_main.cpp
 2static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
 3  ...
 4  // Sanitize the environment.
 5  __libc_init_AT_SECURE(args.envp);
 6  // Initialize system properties
 7  __system_properties_init(); // may use 'environ'
 8  // Register the debuggerd signal handler.
 9  linker_debuggerd_init();
10  ...
11}
12
13extern "C" ElfW(Addr) __linker_init(void* raw_args) {
14  ...  
15  return __linker_init_post_relocation(args, tmp_linker_so);
16}
17
18static ElfW(Addr) __attribute__((noinline))
19__linker_init_post_relocation(KernelArgumentBlock& args, soinfo& tmp_linker_so) {
20  ...
21  ElfW(Addr) start_address = linker_main(args, exe_to_load);
22  ...  
23}

1.3linker_debuggerd_init

设置几个回调方法,定义了一个callbacks结构体,之后把这个callbacks作为参数传递给debuggerd_init()

 1//bionic/linker/linker_debuggerd_android.cpp
 2void linker_debuggerd_init() {
 3  debuggerd_callbacks_t callbacks = {
 4    .get_abort_message = []() {
 5      return __libc_shared_globals()->abort_msg;
 6    },
 7    .post_dump = &notify_gdb_of_libraries,
 8    .get_gwp_asan_state = []() {
 9      return __libc_shared_globals()->gwp_asan_state;
10    },
11    .get_gwp_asan_metadata = []() {
12      return __libc_shared_globals()->gwp_asan_metadata;
13    },
14  };
15  debuggerd_init(&callbacks);
16}

1.4debuggerd_init

  • 首先把callbacks赋给g_callbacks
  • 之后调用mmap为线程的栈分配空间
  • 然后调用mprotect函数,设置stack对应的内存区的保护属性为可读可写,之后把栈的起始点定义在页末尾并对齐
  • 最后调用debuggerd_register_handlers()去注册一些异常信号,而异常信号的处理函数是debuggerd_signal_handler()
 1//system/core/debuggerd/handler/debuggerd_handler.cpp
 2void debuggerd_init(debuggerd_callbacks_t* callbacks) {
 3  if (callbacks) {
 4    g_callbacks = *callbacks;
 5  }
 6
 7  size_t thread_stack_pages = 8;
 8  //设置一个mmap匿名的空间
 9  void* thread_stack_allocation = mmap(nullptr, PAGE_SIZE * (thread_stack_pages + 2), PROT_NONE,
10                                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
11  if (thread_stack_allocation == MAP_FAILED) {
12    fatal_errno("failed to allocate debuggerd thread stack");
13  }
14
15  char* stack = static_cast<char*>(thread_stack_allocation) + PAGE_SIZE;
16  if (mprotect(stack, PAGE_SIZE * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
17    fatal_errno("failed to mprotect debuggerd thread stack");
18  }
19
20  //设置一个栈,指向栈顶,大小为4*1024的整数倍,为后续clone的逻辑空间地址
21  stack = (stack + thread_stack_pages * PAGE_SIZE - 1);
22  stack -= 15;
23  pseudothread_stack = stack;
24  //为当前进程设置signal处理机制
25  struct sigaction action;
26  memset(&action, 0, sizeof(action));
27  sigfillset(&action.sa_mask);
28  action.sa_sigaction = debuggerd_signal_handler;
29  //SA_RESTART代表函数程序不会异常终止,比如read,write等
30  action.sa_flags = SA_RESTART | SA_SIGINFO;
31  //如果可用,使用备用信号堆栈,这样我们可以捕捉堆栈溢出。
32  action.sa_flags |= SA_ONSTACK;
33  //信号注册函数如下所示
34  debuggerd_register_handlers(&action);
35}

debuggerd_register_handlers在未被debug.debuggerd.disable禁用时注册了8种信号:SIGABRT(6)、SIGBUS(7)、SIGFPE(8)、SIGILL(4)、SIGSEGV(11)、SIGSTKFLT(16)、SIGSYS(31)、SIGTRAP(5),此外还无条件注册了BIONIC_SIGNAL_DEBUGGER

 1//system/core/debuggerd/include/debuggerd/handler.h
 2static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
 3  char value[PROP_VALUE_MAX] = "";
 4  bool enabled =
 5      !(__system_property_get("ro.debuggable", value) > 0 && !strcmp(value, "1") &&
 6        __system_property_get("debug.debuggerd.disable", value) > 0 && !strcmp(value, "1"));
 7  if (enabled) {
 8    sigaction(SIGABRT, action, nullptr);
 9    sigaction(SIGBUS, action, nullptr);
10    sigaction(SIGFPE, action, nullptr);
11    sigaction(SIGILL, action, nullptr);
12    sigaction(SIGSEGV, action, nullptr);
13    sigaction(SIGSTKFLT, action, nullptr);
14    sigaction(SIGSYS, action, nullptr);
15    sigaction(SIGTRAP, action, nullptr);
16  }
17
18  sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);
19}

对应的信号为

 1 1) SIGHUP       2) SIGINT       3) SIGQUIT      4) SIGILL       5) SIGTRAP
 2 6) SIGABRT      7) SIGBUS       8) SIGFPE       9) SIGKILL     10) SIGUSR1
 311) SIGSEGV     12) SIGUSR2     13) SIGPIPE     14) SIGALRM     15) SIGTERM
 416) SIGSTKFLT   17) SIGCHLD     18) SIGCONT     19) SIGSTOP     20) SIGTSTP
 521) SIGTTIN     22) SIGTTOU     23) SIGURG      24) SIGXCPU     25) SIGXFSZ
 626) SIGVTALRM   27) SIGPROF     28) SIGWINCH    29) SIGIO       30) SIGPWR
 731) SIGSYS      34) SIGRTMIN    35) SIGRTMIN+1  36) SIGRTMIN+2  37) SIGRTMIN+3
 838) SIGRTMIN+4  39) SIGRTMIN+5  40) SIGRTMIN+6  41) SIGRTMIN+7  42) SIGRTMIN+8
 943) SIGRTMIN+9  44) SIGRTMIN+10 45) SIGRTMIN+11 46) SIGRTMIN+12 47) SIGRTMIN+13
1048) SIGRTMIN+14 49) SIGRTMIN+15 50) SIGRTMAX-14 51) SIGRTMAX-13 52) SIGRTMAX-12
1153) SIGRTMAX-11 54) SIGRTMAX-10 55) SIGRTMAX-9  56) SIGRTMAX-8  57) SIGRTMAX-7
1258) SIGRTMAX-6  59) SIGRTMAX-5  60) SIGRTMAX-4  61) SIGRTMAX-3  62) SIGRTMAX-2
1363) SIGRTMAX-1  64) SIGRTMAX

1.5debuggerd_signal_handler

上述信号一旦产生,直接调用到这里

 1//system/core/debuggerd/handler/debuggerd_handler.cpp
 2static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
 3  ...
 4  if (signal_number == BIONIC_SIGNAL_DEBUGGER) {
 5      ...
 6  } else {
 7    if (g_callbacks.get_abort_message) {
 8      abort_message = g_callbacks.get_abort_message();
 9    }
10    if (g_callbacks.get_gwp_asan_state) {
11      gwp_asan_state = g_callbacks.get_gwp_asan_state();
12    }
13    if (g_callbacks.get_gwp_asan_metadata) {
14      gwp_asan_metadata = g_callbacks.get_gwp_asan_metadata();
15    }
16  }
17  //打印第一句信息
18  //Fatal signal 11 (SIGSEGV), code 1, fault addr 0xa in tid 361 (demo), pid 361 (demo)
19  log_signal_summary(info);
20
21  debugger_thread_info thread_info = {
22      .crashing_tid = __gettid(),
23      .pseudothread_tid = -1,
24      .siginfo = info,
25      .ucontext = context,
26      .abort_msg = reinterpret_cast<uintptr_t>(abort_message),
27      .fdsan_table = reinterpret_cast<uintptr_t>(android_fdsan_get_fd_table()),
28      .gwp_asan_state = reinterpret_cast<uintptr_t>(gwp_asan_state),
29      .gwp_asan_metadata = reinterpret_cast<uintptr_t>(gwp_asan_metadata),
30  };
31  //开启一个新线程:CLONE_CHILD_SETTID会在新线程启动时把thread_info.pseudothread_tid设置为新线程的tid,CLONE_CHILD_CLEARTID会在该线程退出时把它清零并唤醒futex等待者
32  pid_t child_pid =
33    clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
34          CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
35          &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
36  if (child_pid == -1) {
37    fatal_errno("failed to spawn debuggerd dispatch thread");
38  }
39  //主线程等待
40  //正好对应上述线程的进入和退出,等待进入函数debuggerd_dispatch_pseudothread
41  futex_wait(&thread_info.pseudothread_tid, -1);
42  //等待退出函数debuggerd_dispatch_pseudothread  
43  futex_wait(&thread_info.pseudothread_tid, child_pid);
44  ...
45}

1.6debuggerd_dispatch_pseudothread

开启子线程

 1//system/core/debuggerd/handler/debuggerd_handler.cpp
 2static int debuggerd_dispatch_pseudothread(void* arg) {
 3  debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);
 4  //使用系统调用关闭socket句柄,不使用close用来避免bionic的文件描述符所有权检查
 5  for (int i = 0; i < 1024; ++i) {
 6    syscall(__NR_close, i);
 7  }
 8  //创建两个管道(共四个句柄),后续的操作主要在管道中进行
 9  unique_fd input_read, input_write;
10  unique_fd output_read, output_write;
11  if (!Pipe(&input_read, &input_write) != 0 || !Pipe(&output_read, &output_write)) {
12    fatal_errno("failed to create pipe");
13  }
14  ...
15  struct iovec iovs[] = {
16      {.iov_base = &version, .iov_len = sizeof(version)},
17      {.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
18      {.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
19      {.iov_base = &thread_info->abort_msg, .iov_len = sizeof(uintptr_t)},
20      {.iov_base = &thread_info->fdsan_table, .iov_len = sizeof(uintptr_t)},
21      {.iov_base = &thread_info->gwp_asan_state, .iov_len = sizeof(uintptr_t)},
22      {.iov_base = &thread_info->gwp_asan_metadata, .iov_len = sizeof(uintptr_t)},
23  };
24  //当前线程写入一个iovs给管道句柄output_write,后续在crash_dump进程中接收
25  ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
26
27
28  //fork一个子进程,这里运用了clone的方式来fork,用来避免调用pthread_atfork处理程序
29  pid_t crash_dump_pid = __fork();
30  if (crash_dump_pid == -1) {
31    async_safe_format_log(ANDROID_LOG_FATAL, "libc",
32                          "failed to fork in debuggerd signal handler: %s", strerror(errno));
33  } else if (crash_dump_pid == 0) {
34    //子进程最终通过调用/system/bin/crash_dump调用到这个进程中
35    TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
36    TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
37    input_read.reset();
38    input_write.reset();
39    output_read.reset();
40    output_write.reset();
41    ...  
42    async_safe_format_buffer(main_tid, sizeof(main_tid), "%d", thread_info->crashing_tid);
43    async_safe_format_buffer(pseudothread_tid, sizeof(pseudothread_tid), "%d",
44                             thread_info->pseudothread_tid);
45    async_safe_format_buffer(debuggerd_dump_type, sizeof(debuggerd_dump_type), "%d",
46                             get_dump_type(thread_info));
47    //这里传入三个参数
48    //main_tid为当前crash进程号,
49    //pseudothread_tid为当前的子线程号,即之前主线程clone出来的线程号
50    //debuggerd_dump_type为kDebuggerdTombstone  
51    execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
52           nullptr, nullptr);
53    return 1;
54  }
55
56  input_write.reset();
57  output_read.reset();
58  //父进程等待,通过管道句柄input_read接收到刚刚已经fork的子进程的一个\1
59  char buf[4];
60  rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));
61  if (rc == -1) {
62    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "read of IPC pipe failed: %s", strerror(errno));
63    return 1;
64  } else if (rc == 0) {
65    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper failed to exec");
66    return 1;
67  } else if (rc != 1) {
68    async_safe_format_log(ANDROID_LOG_FATAL, "libc",
69                          "read of IPC pipe returned unexpected value: %zd", rc);
70    return 1;
71  } else if (buf[0] != '\1') {
72    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper reported failure");
73    return 1;
74  }
75
76  // Crash_dump正在跟踪我们,复制我们的地址空间的副本供它使用
77  create_vm_process();
78  //等待子进程结束,防止变成僵尸进程
79  int status;
80  if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {
81    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to wait for crash_dump helper: %s",
82                          strerror(errno));
83  } else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
84    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper crashed or stopped");
85  }
86  return 0;
87}

1.7crash_dump

进入/system/bin/crash_dump进程的入口函数

  • 初始化,包括初始化相关信号处理,初始化管道,并且fork一个子进程
  • 子父进程处理,父进程等待子进程退出,子进程解析传入的三个参数,传入子进程的iovs,并将相关crash的子线程保存
  • 客户端连接tombstoned服务端,并且返回一个文件句柄g_output_fd
  • 打印部分堆栈信息,将所有堆栈信息写入到句柄g_output_fd中去
  • 通知AMS做进一步的堆栈处理
  • 通知tombstoned服务端,将tombstone文件生成
  1//system/core/debuggerd/crash_dump.cpp
  2int main(int argc, char** argv) {
  3  //初始化相关信号处理  
  4  DefuseSignalHandlers();
  5  InstallSigPipeHandler();
  6  // 内核中似乎有一个错误,我们的死亡会导致SIGHUP发送到我们的进程组
  7  //如果我们退出,而它已经停止了工作(例如,因为wait_for_gdb)。
  8  //使用setsid创建一个新的进程组来避免碰到这个问题。
  9  setsid();
 10  //打开进程的/proc/pid作为target_proc_fd句柄
 11  std::string target_proc_path = "/proc/" + std::to_string(target_process);
 12  int target_proc_fd = open(target_proc_path.c_str(), O_DIRECTORY | O_RDONLY);
 13  //把之前的两个句柄还原回来
 14  unique_fd output_pipe(dup(STDOUT_FILENO));
 15  unique_fd input_pipe(dup(STDIN_FILENO));
 16  //这里的句柄是为了判断是否还有数据
 17  unique_fd fork_exit_read, fork_exit_write;
 18  if (!Pipe(&fork_exit_read, &fork_exit_write)) {
 19    PLOG(FATAL) << "failed to create pipe";
 20  }
 21  //这里创建一个新子进程和当前进程一致
 22  pid_t forkpid = fork();
 23  if (forkpid == -1) {
 24    PLOG(FATAL) << "fork failed";
 25  } else if (forkpid == 0) {
 26    //子进程保留写,其实是没有数据可写  
 27    fork_exit_read.reset();
 28  } else {
 29    //我们需要伪线程存在,直到我们开始验证vm pid。它做的最后一件事是阻塞我们的waitpid,所以等待直到我们的child告诉我们死亡
 30    //阻塞,直到子进程退出,当前的父进程也退出  
 31    fork_exit_write.reset();
 32    char buf;
 33    TEMP_FAILURE_RETRY(read(fork_exit_read.get(), &buf, sizeof(buf)));
 34    _exit(0);
 35  }
 36  //子进程
 37  //提取出传入的三个参数,pseudothread_tid为上述clone的子线程号,dump_type为kDebuggerdTombstone
 38  Initialize(argv);
 39  ParseArgs(argc, argv, &pseudothread_tid, &dump_type);
 40  //获取crash进程所有的线程号
 41  std::set<pid_t> threads;
 42  if (!android::procinfo::GetProcessTids(g_target_thread, &threads)) {
 43    PLOG(FATAL) << "failed to get process threads";
 44  }
 45
 46  std::map<pid_t, ThreadInfo> thread_info;
 47  siginfo_t siginfo;
 48  std::string error;
 49
 50  {
 51    ATRACE_NAME("ptrace");
 52    for (pid_t thread : threads) {
 53      if (thread == pseudothread_tid) {
 54        continue;
 55      }
 56      ThreadInfo info;
 57      info.pid = target_process;
 58      info.tid = thread;
 59      info.uid = getuid();
 60      info.process_name = process_name;
 61      info.thread_name = get_thread_name(thread);
 62      if (thread == g_target_thread) {
 63        //最重要的是这个,这里的input_pipe对应之前crash子线程中的iovs,这里是真正收集到堆栈信息的函数
 64        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &abort_msg_address,
 65                      &fdsan_table_address, &gwp_asan_state, &gwp_asan_metadata);
 66        info.siginfo = &siginfo;
 67        info.signo = info.siginfo->si_signo;
 68      } else {
 69        info.registers.reset(unwindstack::Regs::RemoteGet(thread));
 70        if (!info.registers) {
 71          PLOG(WARNING) << "failed to fetch registers for thread " << thread;
 72          ptrace(PTRACE_DETACH, thread, 0, 0);
 73          continue;
 74        }
 75      }
 76
 77      thread_info[thread] = std::move(info);
 78    }
 79  }
 80
 81  // Trace the pseudothread with PTRACE_O_TRACECLONE and tell it to fork.
 82  if (!ptrace_seize_thread(target_proc_fd, pseudothread_tid, &error, PTRACE_O_TRACECLONE)) {
 83    LOG(FATAL) << "failed to seize pseudothread: " << error;
 84  }
 85  //通过管道句柄output_pipe,回写给之前crash子线程
 86  if (TEMP_FAILURE_RETRY(write(output_pipe.get(), "\1", 1)) != 1) {
 87    PLOG(FATAL) << "failed to write to pseudothread";
 88  }
 89  //这里会double fork,用于作为守护进程
 90  //第一次fork是为了脱离父进程,setsid让子进程变成session leader,脱离控制终端
 91  //第二次fork是因为session leader有可能会获取控制终端,这样终端断开会发送信号到该进程,导致退出
 92  //并且开始ptrace各个线程
 93  pid_t vm_pid = wait_for_vm_process(pseudothread_tid);
 94  if (ptrace(PTRACE_DETACH, pseudothread_tid, 0, 0) != 0) {
 95    PLOG(FATAL) << "failed to detach from pseudothread";
 96  }
 97  fork_exit_write.reset();
 98  for (const auto& [tid, thread] : thread_info) {
 99    int resume_signal = thread.signo == BIONIC_SIGNAL_DEBUGGER ? 0 : thread.signo;
100    LOG(DEBUG) << "detaching from thread " << tid;
101    if (ptrace(PTRACE_DETACH, tid, 0, resume_signal) != 0) {
102      PLOG(ERROR) << "failed to detach from thread " << tid;
103    }
104  }
105
106  //tombstoned_connect连接,作为客户端连接tombstoned服务端,并且返回一个文件句柄g_output_fd
107  {
108    ATRACE_NAME("tombstoned_connect");
109    LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
110    g_tombstoned_connected =
111        tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd, dump_type);
112  }
113
114  int signo = siginfo.si_signo;
115  bool fatal_signal = signo != BIONIC_SIGNAL_DEBUGGER;
116  bool backtrace = false;
117  // si_value is special when used with BIONIC_SIGNAL_DEBUGGER.
118  //   0: dump tombstone
119  //   1: dump backtrace
120  if (!fatal_signal) {
121    int si_val = siginfo.si_value.sival_int;
122    if (si_val == 0) {
123      backtrace = false;
124    } else if (si_val == 1) {
125      backtrace = true;
126    } else {
127      LOG(WARNING) << "unknown si_value value " << si_val;
128    }
129  }
130
131  //初始化寄存器信息
132  unwindstack::UnwinderFromPid unwinder(256, vm_pid);
133  if (!unwinder.Init(unwindstack::Regs::CurrentArch())) {
134    LOG(FATAL) << "Failed to init unwinder object.";
135  }
136
137  std::string amfd_data;
138  if (backtrace) {
139    ...
140  } else {
141      //打印堆栈信息
142      engrave_tombstone(std::move(g_output_fd), &unwinder, thread_info, g_target_thread,
143                        abort_msg_address, &open_files, &amfd_data, gwp_asan_state,
144                        gwp_asan_metadata);
145
146  }
147  //通知AMS
148  if (fatal_signal) {
149    if (thread_info[target_process].thread_name != "system_server") {
150      activity_manager_notify(target_process, signo, amfd_data);
151    }
152  }
153  close(STDOUT_FILENO);
154  //通知写tombstoned文件  
155  if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
156    LOG(ERROR) << "failed to notify tombstoned of completion";
157  }
158
159  return 0;
160}

上述的内容比较多,主要是新建立的几个socket处理

1.7.1tombstoned_connect

客户端连接tombstoned服务端,并且返回一个文件句柄g_output_fd

1.7.1.1客户端

 1//system/core/debuggerd/tombstoned/tombstoned_client.cpp
 2bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* output_fd,
 3                        DebuggerdDumpType dump_type) {
 4  //建立socket连接,这里是kTombstonedCrashSocketName,对应/dev/socket/crash_dump
 5  //使用SOCK_SEQPACKET传输  
 6  unique_fd sockfd(
 7      socket_local_client((dump_type != kDebuggerdJavaBacktrace ? kTombstonedCrashSocketName
 8                                                                : kTombstonedJavaTraceSocketName),
 9                          ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET));
10  }
11  //使用kDumpRequest,pid,kDebuggerdTombstone数据传输
12  TombstonedCrashPacket packet = {};
13  packet.packet_type = CrashPacketType::kDumpRequest;
14  packet.packet.dump_request.pid = pid;
15  packet.packet.dump_request.dump_type = dump_type;
16  if (TEMP_FAILURE_RETRY(write(sockfd, &packet, sizeof(packet))) != sizeof(packet)) {
17    async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write DumpRequest packet: %s",
18                          strerror(errno));
19    return false;
20  }
21  //收到服务端的packet,并且多携带了一个文件句柄tmp_output_fd
22  unique_fd tmp_output_fd;
23  ssize_t rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd);
24  *tombstoned_socket = std::move(sockfd);
25  *output_fd = std::move(tmp_output_fd);
26  return true;
27}

1.7.1.2服务端

 1//system/core/debuggerd/tombstoned/tombstoned.cpp
 2//tombstoned是开机就启动的进程
 3int main(int, char* []) {
 4  umask(0137);
 5
 6  //初始化信号处理函数,并且注册进去
 7  struct sigaction action = {};
 8  action.sa_handler = [](int signal) {
 9    LOG(ERROR) << "received fatal signal " << signal;
10    _exit(1);
11  };
12  debuggerd_register_handlers(&action);
13  //初始化拦截(intercept)socket的句柄,对应kTombstonedInterceptSocketName
14  int intercept_socket = android_get_control_socket(kTombstonedInterceptSocketName);
15  //初始化句柄,对应/dev/socket/crash_dump
16  int crash_socket = android_get_control_socket(kTombstonedCrashSocketName);
17
18  evutil_make_socket_nonblocking(intercept_socket);
19  evutil_make_socket_nonblocking(crash_socket);
20  //这里用到了event数据结构,最终回调到crash_accept_cb函数
21  evconnlistener* tombstone_listener =
22      evconnlistener_new(base, crash_accept_cb, CrashQueue::for_tombstones(), LEV_OPT_CLOSE_ON_FREE,
23                         -1 /* backlog */, crash_socket);
24
25  //这里初始化成功,开始处理数据,因为这个函数是轮询,所以不会退出tombstoned进程
26  event_base_dispatch(base);
27}

回调到crash_accept_cb函数

服务端分别调用四个函数

  1. crash_accept_cb,接收到客户端连接对应的句柄
  2. crash_request_cb,接收客户端发的信息并校验
  3. perform_request,发给客户端一个文件句柄
  4. crash_completed_cb,接收客户端完成的信息并校验,通过linkat将之前的fd句柄对应到新文件名/data/tombstones/tombstone_xxx
 1//system/core/debuggerd/tombstoned/tombstoned.cpp
 2static void crash_accept_cb(evconnlistener* listener, evutil_socket_t sockfd, sockaddr*, int,
 3                            void*) {
 4  event_base* base = evconnlistener_get_base(listener);
 5  Crash* crash = new Crash();
 6
 7  // TODO: Make sure that only java crashes come in on the java socket
 8  // and only native crashes on the native socket.
 9  struct timeval timeout = { 1, 0 };
10  //注册crash_request_cb函数,并且函数参数为crash
11  event* crash_event = event_new(base, sockfd, EV_TIMEOUT | EV_READ, crash_request_cb, crash);
12  crash->crash_socket_fd.reset(sockfd);
13  crash->crash_event = crash_event;
14  event_add(crash_event, &timeout);
15}
16//传入的参数args为上述的crash
17static void crash_request_cb(evutil_socket_t sockfd, short ev, void* arg) {
18  ssize_t rc;
19  Crash* crash = static_cast<Crash*>(arg);
20  //服务端接收一个request,并且校验客户端传入的数据request
21  TombstonedCrashPacket request = {};
22  rc = TEMP_FAILURE_RETRY(read(sockfd, &request, sizeof(request)));
23  if (request.packet_type != CrashPacketType::kDumpRequest) {
24    LOG(WARNING) << "unexpected crash packet type, expected kDumpRequest, received  "
25                 << StringPrintf("%#2hhX", request.packet_type);
26    goto fail;
27  }
28  ...
29  crash->crash_type = request.packet.dump_request.dump_type;
30  if (CrashQueue::for_crash(crash)->maybe_enqueue_crash(crash)) {
31    LOG(INFO) << "enqueueing crash request for pid " << crash->crash_pid;
32  } else {
33    //调用到这里
34    perform_request(crash);
35  }
36
37  return;
38}
39//处理这个请求
40static void perform_request(Crash* crash) {
41  unique_fd output_fd;
42  bool intercepted =
43      intercept_manager->GetIntercept(crash->crash_pid, crash->crash_type, &output_fd);
44  if (!intercepted) {
45    if (crash->crash_type == kDebuggerdNativeBacktrace) {
46      ...
47    } else {
48      //这里没有intercepted,会调用到这里,并且将/data/tombstone/tombstone_xxx文件的句柄传出,为output_fd
49      std::tie(crash->crash_tombstone_path, output_fd) = CrashQueue::for_crash(crash)->get_output();
50      crash->crash_tombstone_fd.reset(dup(output_fd.get()));
51    }
52  }
53  //传入的参数为kPerformDump,pid,output_fd
54  TombstonedCrashPacket response = {
55    .packet_type = CrashPacketType::kPerformDump
56  };
57  ssize_t rc =
58      SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get());
59  output_fd.reset();
60
61    // TODO: Make this configurable by the interceptor?
62    struct timeval timeout = { 10, 0 };
63
64    event_base* base = event_get_base(crash->crash_event);
65    //最终调用crash_completed_cb,参数为crash
66    event_assign(crash->crash_event, base, crash->crash_socket_fd, EV_TIMEOUT | EV_READ,
67                 crash_completed_cb, crash);
68    event_add(crash->crash_event, &timeout);
69
70
71  CrashQueue::for_crash(crash)->on_crash_started();
72  return;
73}
74
75static void crash_completed_cb(evutil_socket_t sockfd, short ev, void* arg) {
76  ssize_t rc;
77  Crash* crash = static_cast<Crash*>(arg);
78  TombstonedCrashPacket request = {};
79  ...
80  //开始阻塞读取,直到有数据过来    
81  rc = TEMP_FAILURE_RETRY(read(sockfd, &request, sizeof(request)));
82  //数据过来之后,校验数据的合法性  
83  if (request.packet_type != CrashPacketType::kCompletedDump) {
84    LOG(WARNING) << "unexpected crash packet type, expected kCompletedDump, received "
85                 << uint32_t(request.packet_type);
86    goto fail;
87  }
88  //校验通过之后,通过linkat将之前的fd句柄对应到新文件名/data/tombstones/tombstone_xxx
89  if (crash->crash_tombstone_fd != -1) {
90    //这个句柄就是之前的output_fd  
91    std::string fd_path = StringPrintf("/proc/self/fd/%d", crash->crash_tombstone_fd.get());
92    std::string tombstone_path = CrashQueue::for_crash(crash)->get_next_artifact_path();
93    rc = linkat(AT_FDCWD, fd_path.c_str(), AT_FDCWD, tombstone_path.c_str(), AT_SYMLINK_FOLLOW);
94    ...
95  }
96}

1.7.2engrave_tombstone

打印部分堆栈信息,将所有堆栈信息写入到句柄g_output_fd中去

 1//system/core/debuggerd/libdebuggerd/tombstone.cpp
 2void engrave_tombstone_ucontext(int tombstone_fd, uint64_t abort_msg_address, siginfo_t* siginfo,
 3                                ucontext_t* ucontext) {
 4  //初始化信息
 5  ...
 6  std::unique_ptr<unwindstack::Regs> regs(
 7      unwindstack::Regs::CreateFromUcontext(unwindstack::Regs::CurrentArch(), ucontext));
 8
 9  std::map<pid_t, ThreadInfo> threads;
10  threads[gettid()] = ThreadInfo{
11      .registers = std::move(regs),
12      .uid = uid,
13      .tid = tid,
14      .thread_name = thread_name,
15      .pid = pid,
16      .process_name = process_name,
17      .siginfo = siginfo,
18  };
19  //初始化寄存器
20  unwindstack::UnwinderFromPid unwinder(kMaxFrames, pid);
21  if (!unwinder.Init(unwindstack::Regs::CurrentArch())) {
22    LOG(FATAL) << "Failed to init unwinder object.";
23  }
24
25  engrave_tombstone(unique_fd(dup(tombstone_fd)), &unwinder, threads, tid, abort_msg_address,
26                    nullptr, nullptr, 0u, 0u);
27}
28
29void engrave_tombstone(unique_fd output_fd, unwindstack::Unwinder* unwinder,
30                       const std::map<pid_t, ThreadInfo>& threads, pid_t target_thread,
31                       uint64_t abort_msg_address, OpenFilesList* open_files,
32                       std::string* amfd_data, uintptr_t gwp_asan_state_ptr,
33                       uintptr_t gwp_asan_metadata_ptr) {
34  // don't copy log messages to tombstone unless this is a dev device
35  bool want_logs = android::base::GetBoolProperty("ro.debuggable", false);
36
37  log_t log;
38  log.current_tid = target_thread;
39  log.crashed_tid = target_thread;
40  //设置log中的tfd的句柄为先前的文件句柄output_fd
41  log.tfd = output_fd.get();
42  log.amfd_data = amfd_data;
43  //开始打印第一行
44  _LOG(&log, logtype::HEADER, "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
45  //打印header,包括架构信息版本号等  
46  dump_header_info(&log);
47  //打印时间戳
48  dump_timestamp(&log, time(nullptr));
49
50  auto it = threads.find(target_thread);
51  if (it == threads.end()) {
52    LOG(FATAL) << "failed to find target thread";
53  }
54
55  GwpAsanCrashData gwp_asan_crash_data(unwinder->GetProcessMemory().get(),
56                                       gwp_asan_state_ptr,
57                                       gwp_asan_metadata_ptr, it->second);
58  //打印线程中的进程号线程号、信号及其产生原因、堆栈等信息
59  dump_thread(&log, unwinder, it->second, abort_msg_address, true,
60              gwp_asan_crash_data);
61
62  if (want_logs) {
63    dump_logs(&log, it->second.pid, 50);
64  }
65
66  for (auto& [tid, thread_info] : threads) {
67    if (tid == target_thread) {
68      continue;
69    }
70    //打印相关线程的相关信息
71    dump_thread(&log, unwinder, thread_info, 0, false, gwp_asan_crash_data);
72  }
73
74  if (open_files) {
75    _LOG(&log, logtype::OPEN_FILES, "\nopen files:\n");
76    dump_open_files_list(&log, *open_files, "    ");
77  }
78
79  if (want_logs) {
80    dump_logs(&log, it->second.pid, 0);
81  }
82}

这里的部分堆栈打印原理:所有内容都会写入tombstone文件,但只选择性地把三类打印到logcat:HEADER、REGISTERS、BACKTRACE

 1__attribute__((__weak__, visibility("default")))
 2void _LOG(log_t* log, enum logtype ltype, const char* fmt, ...) {
 3  va_list ap;
 4  va_start(ap, fmt);
 5  _VLOG(log, ltype, fmt, ap);
 6  va_end(ap);
 7}
 8
 9__attribute__((__weak__, visibility("default")))
10void _VLOG(log_t* log, enum logtype ltype, const char* fmt, va_list ap) {
11  bool write_to_tombstone = (log->tfd != -1);
12  bool write_to_logcat = is_allowed_in_logcat(ltype)
13                      && log->crashed_tid != -1
14                      && log->current_tid != -1
15                      && (log->crashed_tid == log->current_tid);
16  //是否加入kmsg日志  
17  static bool write_to_kmsg = should_write_to_kmsg();
18
19  std::string msg;
20  android::base::StringAppendV(&msg, fmt, ap);
21
22  if (msg.empty()) return;
23  //之前传入的tfd已经是指定的文件句柄,所有的数据都会被写到该句柄中
24  if (write_to_tombstone) {
25    TEMP_FAILURE_RETRY(write(log->tfd, msg.c_str(), msg.size()));
26  }
27  //根据是否是HEADER、REGISTERS、BACKTRACE判断是否需要部分写入logcat
28  if (write_to_logcat) {
29    __android_log_buf_write(LOG_ID_CRASH, ANDROID_LOG_FATAL, LOG_TAG, msg.c_str());
30    if (log->amfd_data != nullptr) {
31      *log->amfd_data += msg;
32    }
33
34    if (write_to_kmsg) {
35      unique_fd kmsg_fd(open("/dev/kmsg_debug", O_WRONLY | O_APPEND | O_CLOEXEC));
36      if (kmsg_fd.get() >= 0) {
37        std::vector<std::string> fragments = android::base::Split(msg, "\n");
38        for (const std::string& fragment : fragments) {
39          static constexpr char prefix[] = "<3>DEBUG: ";
40          struct iovec iov[3];
41          iov[0].iov_base = const_cast<char*>(prefix);
42          iov[0].iov_len = strlen(prefix);
43          iov[1].iov_base = const_cast<char*>(fragment.c_str());
44          iov[1].iov_len = fragment.length();
45          iov[2].iov_base = const_cast<char*>("\n");
46          iov[2].iov_len = 1;
47          TEMP_FAILURE_RETRY(writev(kmsg_fd.get(), iov, 3));
48        }
49      }
50    }
51  }
52}
53
54bool is_allowed_in_logcat(enum logtype ltype) {
55  if ((ltype == HEADER)
56   || (ltype == REGISTERS)
57   || (ltype == BACKTRACE)) {
58    return true;
59  }
60  return false;
61}

着重突出

1)

1_LOG(&log, logtype::HEADER, "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");

对应

108-04 17:02:06.366  1671  1671 F DEBUG   : *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

2)dump_header_info对应

108-04 17:02:06.366  1671  1671 F DEBUG   : Build fingerprint: 'xxx'
208-04 17:02:06.366  1671  1671 F DEBUG   : Revision: '0'
308-04 17:02:06.366  1671  1671 F DEBUG   : ABI: 'arm'

3)dump_thread_info对应

108-04 17:02:06.366  1671  1671 F DEBUG   : pid: 361, tid: 361, name: demo  >>> /system/bin/demo <<<

4)dump_signal_info对应

108-04 17:02:06.366  1671  1671 F DEBUG   : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0xa

5)dump_probable_cause对应

108-04 17:02:06.366  1671  1671 F DEBUG   : Cause: null pointer dereference

6)dump_registers对应

108-04 17:02:06.366  1671  1671 F DEBUG   :     r0 0000000a  r1 00000061  r2 0538ad06  r3 fff162b0
208-04 17:02:06.366  1671  1671 F DEBUG   :     r4 b91d04ed  r5 b91dde08  r6 00000004  r7 fff1639c
308-04 17:02:06.366  1671  1671 F DEBUG   :     r8 00000000  r9 00000000  sl 00000000  fp fff1638c
408-04 17:02:06.367  1671  1671 F DEBUG   :     ip fff1639c  sp fff16308  lr eb97f0eb  pc b91d0342  cpsr 20010030

7)log_backtrace对应

108-04 17:02:06.373  1671  1671 F DEBUG   :
208-04 17:02:06.373  1671  1671 F DEBUG   : backtrace:
308-04 17:02:06.373  1671  1671 F DEBUG   :     #00 pc 00006342  /system/bin/demo (main+489)
408-04 17:02:06.373  1671  1671 F DEBUG   :     #01 pc 0007669d  /system/lib/libc.so (__libc_init+48)
508-04 17:02:06.373  1671  1671 F DEBUG   :     #02 pc 000024d8  /system/bin/demo (_start_main+88)

1.7.3activity_manager_notify

通知AMS做进一步的堆栈处理

1.7.3.1客户端

 1//system/core/debuggerd/crash_dump.cpp
 2static bool activity_manager_notify(pid_t pid, int signal, const std::string& amfd_data) {
 3  //建立socket连接,连接本机socket路径为/data/system/ndebugsocket
 4  android::base::unique_fd amfd(socket_local_client(
 5      "/data/system/ndebugsocket", ANDROID_SOCKET_NAMESPACE_FILESYSTEM, SOCK_STREAM));
 6
 7  struct timeval tv = {
 8    .tv_sec = 1,
 9    .tv_usec = 0,
10  };
11  //设置发送超时时间为1s
12  if (setsockopt(amfd.get(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1) {
13    PLOG(ERROR) << "failed to set send timeout on activity manager socket";
14    return false;
15  }
16  //设置接收超时时间为3s
17  tv.tv_sec = 3;  // 3 seconds on handshake read
18  if (setsockopt(amfd.get(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
19    PLOG(ERROR) << "failed to set receive timeout on activity manager socket";
20    return false;
21  }
22
23  //下面传输的是crash进程的pid和signal
24  //这里将主机字节序转化成网络字节序
25  uint32_t datum = htonl(pid);
26  if (!android::base::WriteFully(amfd, &datum, 4)) {
27    PLOG(ERROR) << "AM pid write failed";
28    return false;
29  }
30  datum = htonl(signal);
31  if (!android::base::WriteFully(amfd, &datum, 4)) {
32    PLOG(ERROR) << "AM signal write failed";
33    return false;
34  }
35  //amfd_data就是先前crash进程的所有打印堆栈内容,就是0NE简介中的打印
36  if (!android::base::WriteFully(amfd, amfd_data.c_str(), amfd_data.size() + 1)) {
37    PLOG(ERROR) << "AM data write failed";
38    return false;
39  }
40
41  //最终会收到一个ack,这里服务端没有设置任何值,所以客户端实际上收到的是0x0
42  char ack;
43  android::base::ReadFully(amfd, &ack, 1);
44  return true;
45}

关于这里为什么拿到pid和signal之后,还需要先转换为网络字节序再发送

网络协议指定了字节序,因此异构计算机系统能够交换协议信息而不会被字节序所混淆。

TCP/IP协议栈使用大端字节序。应用程序交换格式化数据时,字节序问题就会出现。对于TCP/IP,地址用网络字节序来表示,所以应用程序有时需要在处理器的字节序与网络字节序之间转换它们。例如,以一种易读的形式打印一个地址时,这种转换很常见。本文的连接虽然是本机的Unix域套接字,不经过TCP/IP协议栈,但收发双方仍需约定统一的字节序,这里客户端通过htonl统一使用网络字节序(大端)。

1.7.3.2服务端

服务端即为AMS服务

1//frameworks/base/services/core/java/com/android/server/am/ActivityManagerService.java
2public void startObservingNativeCrashes() {
3    final NativeCrashListener ncl = new NativeCrashListener(this);
4    ncl.start();
5}

NativeCrashListener线程启动

 1//frameworks/base/services/core/java/com/android/server/am/NativeCrashListener.java
 2final class NativeCrashListener extends Thread {
 3    static final String DEBUGGERD_SOCKET_PATH = "/data/system/ndebugsocket";
 4	@Override
 5    public void run() {
 6        //这里是响应客户端的ack,为一个0值
 7        final byte[] ackSignal = new byte[1];
 8
 9        if (DEBUG) Slog.i(TAG, "Starting up");
10        {
11            //创建一个socket文件,DEBUGGERD_SOCKET_PATH为/data/system/ndebugsocket
12            File socketFile = new File(DEBUGGERD_SOCKET_PATH);
13            if (socketFile.exists()) {
14                socketFile.delete();
15            }
16        }
17
18        try {
19            //1.创建服务端socket句柄,AF_UNIX为本地通信,SOCK_STREAM为面向连接的字节流套接字(用法类似TCP,但不经过TCP/IP协议栈)
20            FileDescriptor serverFd = Os.socket(AF_UNIX, SOCK_STREAM, 0);
21            //获取DEBUGGERD_SOCKET_PATH,转化为UnixSocketAddress
22            final UnixSocketAddress sockAddr = UnixSocketAddress.createFileSystem(
23                    DEBUGGERD_SOCKET_PATH);
24            //2.关联地址和套接字
25            Os.bind(serverFd, sockAddr);
26            //3.SOCK_STREAM是面向连接的,需要创建监听队列,这里backlog为1,连接在下面的循环中逐个处理
27            Os.listen(serverFd, 1);
28            //设置socket文件权限为可读可写可执行
29            Os.chmod(DEBUGGERD_SOCKET_PATH, 0777);
30			//这里就是为了保证可以循环处理多个IO,服务端不能直接关闭
31            while (true) {
32                FileDescriptor peerFd = null;
33                try {
34                    if (MORE_DEBUG) Slog.v(TAG, "Waiting for debuggerd connection");
35                    //4.服务端获取连接请求并建立连接
36                    peerFd = Os.accept(serverFd, null /* peerAddress */);
37                    if (peerFd != null) {
38						//服务端处理客户端的请求
39                        consumeNativeCrashData(peerFd);
40                    }
41                } catch (Exception e) {
42                    Slog.w(TAG, "Error handling connection", e);
43                } finally {
44                    if (peerFd != null) {
45                        try {
46                            //5.存在客户端,那么发送一个响应
47                            Os.write(peerFd, ackSignal, 0, 1);
48                        } catch (Exception e) {
49                            ...
50                        }
51                        try {
52                            //6.关闭客户端句柄
53                            Os.close(peerFd);
54                        } catch (ErrnoException e) {
55                            ...
56                        }
57                    }
58                }
59            }
60        } catch (Exception e) {
61            Slog.e(TAG, "Unable to init native debug socket!", e);
62        }
63    }

consumeNativeCrashData处理来自客户端的请求

 1//frameworks/base/services/core/java/com/android/server/am/NativeCrashListener.java
 2void consumeNativeCrashData(FileDescriptor fd) {
 3    final byte[] buf = new byte[4096];
 4    final ByteArrayOutputStream os = new ByteArrayOutputStream(4096);
 5
 6    try {
 7        StructTimeval timeout = StructTimeval.fromMillis(SOCKET_TIMEOUT_MILLIS);
 8        Os.setsockoptTimeval(fd, SOL_SOCKET, SO_RCVTIMEO, timeout);
 9        Os.setsockoptTimeval(fd, SOL_SOCKET, SO_SNDTIMEO, timeout);
10        //这里的buf实际上就是pid和signal的拼接,本文是0x169 0xB 对应正好pid为361和signal为11
11        int headerBytes = readExactly(fd, buf, 0, 8);
12        int pid = unpackInt(buf, 0);
13        int signal = unpackInt(buf, 4);
14
15        // now the text of the dump
16        if (pid > 0) {
17            final ProcessRecord pr;
18            synchronized (mAm.mPidsSelfLocked) {
19                pr = mAm.mPidsSelfLocked.get(pid);
20            }
21            //因为demo是自行启动的native进程,不在AMS维护的mPidsSelfLocked中,所以不会找到对应的pr
22            if (pr != null) {
23                ...
24            } else {
25                Slog.w(TAG, "Couldn't find ProcessRecord for pid " + pid);
26            }
27        } else {
28            Slog.e(TAG, "Bogus pid!");
29        }
30    } catch (Exception e) {
31        Slog.e(TAG, "Exception dealing with report", e);
32    }
33}

这里最终会打印Couldn't find ProcessRecord for pid 361

1.7.4tombstoned_notify_completion

通知tombstoned服务端,将tombstone_xxx文件生成

1.7.4.1客户端

这里发送了一个包,类型为kCompletedDump

1//system/core/debuggerd/tombstoned/tombstoned_client.cpp
2bool tombstoned_notify_completion(int tombstoned_socket) {
3  TombstonedCrashPacket packet = {};
4  packet.packet_type = CrashPacketType::kCompletedDump;
5  if (TEMP_FAILURE_RETRY(write(tombstoned_socket, &packet, sizeof(packet))) != sizeof(packet)) {
6    return false;
7  }
8  return true;
9}

1.7.4.2服务端

 1static void crash_completed_cb(evutil_socket_t sockfd, short ev, void* arg) {
 2  ...
 3  //之前在这里阻塞,现在可以接收到packet了
 4  rc = TEMP_FAILURE_RETRY(read(sockfd, &request, sizeof(request)));
 5  //解析packet的合法性
 6  if (request.packet_type != CrashPacketType::kCompletedDump) {
 7    LOG(WARNING) << "unexpected crash packet type, expected kCompletedDump, received "
 8                 << uint32_t(request.packet_type);
 9    goto fail;
10  }
11  //这里这一步最关键,通过之前保存的output_fd文件句柄,通过linkat将tombstone文件生成
12  //这里面的tombstone_path就是对应的/data/tombstones/tombstone_xxx
13  if (crash->crash_tombstone_fd != -1) {
14    std::string fd_path = StringPrintf("/proc/self/fd/%d", crash->crash_tombstone_fd.get());
15    std::string tombstone_path = CrashQueue::for_crash(crash)->get_next_artifact_path();
16  }
17
18  rc = linkat(AT_FDCWD, fd_path.c_str(), AT_FDCWD, tombstone_path.c_str(), AT_SYMLINK_FOLLOW);
19  //成功之后会打印这句话
20  LOG(ERROR) << "Tombstone written to: " << tombstone_path;
21  ...
22}

2关于一些细节

1)堆栈信息是怎么打印出来的

logcat中的堆栈信息是在_LOG中通过__android_log_buf_write写入的(仅HEADER、REGISTERS、BACKTRACE三类会写入logcat)。最后由tombstoned_notify_completion去通知服务端,让其通过linkat将tombstone_xxx文件生成。对应的tombstone目录下也会出现tombstone_xxx,里面有更加完整的堆栈信息

2)这个堆栈信息是不是每次都会打印,什么情况不再打印

如果进程自定义了signal处理函数并拦截了该信号,就不会打印上述堆栈信息。但是如果使用sigaction注册处理函数之后不做拦截,而是在处理函数中恢复原来的处理方式(oldact),就会继续打印

 1#include <signal.h>
 2struct sigaction newact, oldact;
 3void sig_func(int signo, siginfo_t *info, void *context)
 4{
 5    //handler signal SIGINT
 6    ...
 7    //continue signal SIGINT,添加这句可以继续调用打印
 8    sigaction(SIGINT, &oldact, NULL);
 9}
10
11void func(){
12  
13  newact.sa_handler = sig_func;
14  sigemptyset(&newact.sa_mask);
15  newact.sa_flags = 0;
16  newact.sa_flags |= SA_SIGINFO;
17
18  sigaction(SIGINT, &newact, &oldact);
19}

3总结

1)NE进程句柄之间的关系

2)NE进程的调用流程

参考

[1] 内核工匠, Tombstone原理分析, 2021.

[2] 内核工匠, Android内存异常机制(用户空间)_NE, 2020.

[3] 内核工匠, Android内存异常机制(用户空间)_JE, 2020.

[4] 袁辉辉, 理解Native Crash处理流程, 2016.

[5] 袁辉辉, 理解Android Crash处理流程, 2016.

[6] 袁辉辉, 解读Java进程的Trace文件, 2016.

[7] 袁辉辉, Native进程之Trace原理, 2016.

[8] onITLoad, PRCTL - Linux手册页, 2020.

[9] liuanhf, linux daemon进程为何 fork 两次, 2020.

[10] man7.org , ptrace(2) — Linux manual page, 2020.

[11] gitbook.net, linkat()函数 Unix/Linux, 2020.

[12] 小林code, TCP 三次握手与四次挥手面试题, 2022.

[13] 皇甫懿, linux的SA_RESTART信号, 2019.

[14] Spring__Rider, 函数link、linkat、unlink、unlinkat和remove, 2018.

[15] xinjing_wangtao, 高性能网络编程(4)–TCP连接的关闭 (B), 2016.