1.问题背景

开发同事反馈Vehicle服务下行信号可以发出,但是没有收到回调。给了一份关键日志如下:

1
2
3
4
5
6
7
8
9
// logD:进程输出日志    326是logD的进程id
// 报错的进程是automotive.vehicle@2.0-impl 进程id是270
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '

2.问题调查过程

调查开始肯定是从报错的进程开始,遍历PID找到进程所有日志,看能不能找到部分线索。

1
2
3
4
5
6
KERN KERN 326 log info verbose 1 binder: 270:282 transaction failed 29201/-28, size 1365540-109248 line 3274
auto LOGD 326 log error verbose 1 E/automotive.vehicle@2.0-impl( 270): Failed to notify client [class or subclass of android.hardware.automotive.vehicle@2.0::IVehicleCallback]@remote, err: Status(EX_TRANSACTION_FAILED): 'FAILED_TRANSACTION: '
KERN KERN 326 log info verbose 1 binder: 270:282 transaction failed 29201/-28, size 957540-76608 line 3274
KERN KERN 326 log info verbose 1 binder: 270:282 transaction failed 29201/-28, size 591540-47328 line 3274
KERN KERN 326 log info verbose 1 binder: 270:282 transaction failed 29201/-28, size 577940-46240 line 3274
KERN KERN 326 log info verbose 1 binder: 270:282 transaction failed 29201/-28, size 637140-50976 line 3274

搜索”270“进程id可以看到vehicle进程报了binder传输异常。

然后看到KERN, 是kernal报出来的错误。因此在kernal中去搜索错误报错地方,看是否能找到相关线索。

搜索”transaction failed“

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// kernel/msm-4.14/drivers/android/binder.c
err_invalid_target_handle:
if (target_thread)
binder_thread_dec_tmpref(target_thread);
if (target_proc)
binder_proc_dec_tmpref(target_proc);
if (target_node) {
binder_dec_node(target_node, 1, 0);
binder_dec_node_tmpref(target_node);
}

binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
"%d:%d transaction failed %d/%d, size %lld-%lld line %d\n",
proc->pid, thread->pid, return_error, return_error_param,
(u64)tr->data_size, (u64)tr->offsets_size,
return_error_line);

搜索关键词,找到了关键log,binder驱动报错了,我们细数一下日志中每个报错参数的意思。

1
2
3
4
5
6
7
8
9
10
"%d:%d transaction failed %d/%d, size %lld-%lld line %d\n"
proc->pid, thread->pid, return_error, return_error_param,(u64)tr->data_size, (u64)tr->offsets_size, return_error_line);

270:282 transaction failed 29201/-28, size 637140-50976 line 3274

报错进程id:270 线程id:282
错误码:29201(BR_FAILED_REPLY) 标准错误码:-28 (No space left on device)
binder_transaction_data的data_size大小为637140字节
binder_transaction_data的offsets_size数据偏移为50976
报错代码行数:3274

因为GOS的linux系统版本是:Linux version 4.14.156;6125项目的kernal代码版本是Linux version 4.14.112的。因此这个报错行数参考意义不大。

从上面日志已经可以大概猜测到是发送的数据量过大。当前的binder_transaction_data的大小是637140字节。

3.源码分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// frameworks/native/libs/binder/ProcessState.cpp
// BINDER_VM_SIZE binder的一次消息传送大小为1M - 8k
#define BINDER_VM_SIZE ((1 * 1024 * 1024) - sysconf(_SC_PAGE_SIZE) * 2)

ProcessState::ProcessState(const char *driver)
: mDriverName(String8(driver))
, mDriverFD(open_driver(driver))
, mVMStart(MAP_FAILED)
, mThreadCountLock(PTHREAD_MUTEX_INITIALIZER)
, mThreadCountDecrement(PTHREAD_COND_INITIALIZER)
, mExecutingThreadsCount(0)
, mMaxThreads(DEFAULT_MAX_BINDER_THREADS)
, mStarvationStartTimeMs(0)
, mManagesContexts(false)
, mBinderContextCheckFunc(NULL)
, mBinderContextUserData(NULL)
, mThreadPoolStarted(false)
, mThreadPoolSeq(1)
{
if (mDriverFD >= 0) {
// mmap the binder, providing a chunk of virtual address space to receive transactions.
// 构造方法中调用mmap将申请的大小传给binder驱动
mVMStart = mmap(0, BINDER_VM_SIZE, PROT_READ, MAP_PRIVATE | MAP_NORESERVE, mDriverFD, 0);
if (mVMStart == MAP_FAILED) {
// *sigh*
ALOGE("Using %s failed: unable to mmap transaction memory.\n", mDriverName.c_str());
close(mDriverFD);
mDriverFD = -1;
mDriverName.clear();
}
}

LOG_ALWAYS_FATAL_IF(mDriverFD < 0, "Binder driver could not be opened. Terminating.");
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// kernel/msm-4.14/drivers/android/binder.c

static const struct file_operations binder_fops = {
.owner = THIS_MODULE,
.poll = binder_poll,
.unlocked_ioctl = binder_ioctl,
.compat_ioctl = binder_ioctl,
.mmap = binder_mmap,
.open = binder_open,
.flush = binder_flush,
.release = binder_release,
};

static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
{
......
// 如果传入的值大于4M,最多只申请4M
if ((vma->vm_end - vma->vm_start) > SZ_4M)
vma->vm_end = vma->vm_start + SZ_4M;

......

// 真正去分配内存的地方
ret = binder_alloc_mmap_handler(&proc->alloc, vma);
if (ret)
return ret;
mutex_lock(&proc->files_lock);
proc->files = get_files_struct(current);
mutex_unlock(&proc->files_lock);
return 0;

err_bad_arg:
pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// kernel/msm-4.14/drivers/android/binder_alloc.c
int binder_alloc_mmap_handler(struct binder_alloc *alloc,
struct vm_area_struct *vma)
{
int ret;
const char *failure_string;
struct binder_buffer *buffer;
......

alloc->buffer = (void __user *)vma->vm_start;
mutex_unlock(&binder_alloc_mmap_lock);

// 申请内核空间内存
alloc->pages = kzalloc(sizeof(alloc->pages[0]) *
((vma>vm_end - vma->vm_start)- / PAGE_SIZE),
GFP_KERNEL);
if (alloc->pages == NULL) {
ret = -ENOMEM;
failure_string = "alloc page array";
goto err_alloc_pages_failed;
}
alloc->buffer_size = vma->vm_end - vma->vm_start;

// 存binder_buffer指针地址
buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
if (!buffer) {
ret = -ENOMEM;
failure_string = "alloc buffer struct";
goto err_alloc_buf_struct_failed;
}

// !!!非oneway的传输方式user_data大小就是用户空间传入的大小。
buffer->user_data = alloc->buffer;
list_add(&buffer->entry, &alloc->buffers);
buffer->free = 1;
binder_insert_free_buffer(alloc, buffer);
// !!!oneway模式传输方式,free_async_space空间只有用户空间传入空间大小的一半。
alloc->free_async_space = alloc->buffer_size / 2;
binder_alloc_set_vma(alloc, vma);
mmgrab(alloc->vma_vm_mm);

return 0;

......
return ret;
}

可以看到如果binder采用Binder oneway方式通信,数据的传输空间大小只有(1M - 8k) / 2,即520190字节。

因此,需要模块具体调查哪个信号传输的数据超过了508kb。