2017-12-07
记录linux-tracing-workshop实验过程,第二部分共三篇。
本节试着写一个BPF来跟踪setuid系统调用。 我们可以使用trace来跟踪setuid:
root@ubuntu1604:~/bcc/tools# ./trace.py 'sys_setuid "uid=0x%x", arg1' 'r::sys_setuid "rc=%d", retval'
PID TID COMM FUNC -
34913 34913 su sys_setuid uid=0x3e8
34913 34913 su sys_setuid rc=0
34932 34932 cron sys_setuid uid=0x0
34932 34932 cron sys_setuid rc=0
也可以写一个独立的BPF工具,本节模仿killsnoop.py内容实现setuid的trace。
第一步,替换sys_kill为sys_setuid
kprobe__sys_kill->kprobe__sys_setuid
kretprobe__sys_kill->kretprobe__sys_setuid
第二步,修改函数签名
int kprobe__sys_setuid(struct pt_regs *ctx, int tpid, int sig)-->
int kprobe__sys_setuid(struct pt_regs *ctx, u32 uid)
第三步,修改数据结构,用setuid的参数替换kill的参数
struct val_t {
u64 pid;
u32 uid;
char comm[TASK_COMM_LEN];
};
struct data_t {
u64 pid;
u32 uid;
int ret;
char comm[TASK_COMM_LEN];
};
class Data(ct.Structure):
_fields_ = [
("pid", ct.c_ulonglong),
("uid", ct.c_uint),
("ret", ct.c_int),
("comm", ct.c_char * TASK_COMM_LEN)
]
第四步,在kprobe和kretprobe修改相应的数据
int kprobe__sys_kill(struct pt_regs *ctx, u32 uid)
{
u32 pid = bpf_get_current_pid_tgid();
FILTER
struct val_t val = {.pid = pid};
if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
val.uid = uid;
infotmp.update(&pid, &val);
}
return 0;
};
int kretprobe__sys_kill(struct pt_regs *ctx)
{
struct data_t data = {};
struct val_t *valp;
u32 pid = bpf_get_current_pid_tgid();
valp = infotmp.lookup(&pid);
if (valp == 0) {
// missed entry
return 0;
}
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
data.pid = pid;
data.uid = valp->uid;
data.ret = PT_REGS_RC(ctx);
events.perf_submit(ctx, &data, sizeof(data));
infotmp.delete(&pid);
return 0;
}
第五步,修改print的数据
print("%-9s %-6s %-16s %-6s %s" % (
"TIME", "PID", "COMM", "UID", "RESULT"))
# process event
def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
if (args.failed and (event.ret >= 0)):
return
print("%-9s %-6d %-16s %-6d %d" % (strftime("%H:%M:%S"),
event.pid, event.comm.decode(), event.uid, event.ret))
效果
root@ubuntu1604:~/bcc/tools# ./setuidsnoop.py
TIME PID COMM UID RESULT
11:41:05 36919 su 1000 0
11:45:01 36941 cron 0 0
原实验完整版
该实验开发一个机遇USDT probe的BCC工具,用来监控数据库的query延迟和执行。
首先下载postgresql,使用–enable-dtraceb编译,使其支持USDT,运行:
$ cd /usr/local/pgsql/bin
$ ./initdb -D /tmp/pgdata
$ ./pg_ctl -D /tmp/pgdata start
查看USDT probe点:
test@ubuntu1604:/usr/local/pgsql/bin$ /usr/share/bcc/tools/tplist -p $(pgrep -n postgres) | grep query
/usr/local/pgsql/bin/postgres postgresql:query__parse__start
/usr/local/pgsql/bin/postgres postgresql:query__parse__done
/usr/local/pgsql/bin/postgres postgresql:query__rewrite__start
/usr/local/pgsql/bin/postgres postgresql:query__rewrite__done
/usr/local/pgsql/bin/postgres postgresql:query__plan__start
/usr/local/pgsql/bin/postgres postgresql:query__plan__done
/usr/local/pgsql/bin/postgres postgresql:query__start
/usr/local/pgsql/bin/postgres postgresql:query__done
/usr/local/pgsql/bin/postgres postgresql:query__execute__start
/usr/local/pgsql/bin/postgres postgresql:query__execute__done
本实验关注query__start 和 query__done,query__start第一个参数就是query参数。
下面根据实验给的整体框架完成工具编写。
第一步:找到PostgreSQL的进程ID
dbpid = int(subprocess.check_output("pgrep -n postgres".split()))
第二步:定义数据结构,包含PID, timestamp, duration, 以及 query文本
struct temp_t {
u64 timestamp;
char *query;
};
struct data_t {
u64 pid;
u64 timestamp;
u64 duration;
char query[256];
};
BPF_HASH(temp, u64, struct temp_t);
BPF_PERF_OUTPUT(events);
第三步:第一两个函数处理query__start 和 query__end
int probe_query_start(struct pt_regs *ctx) {
struct temp_t tmp = {};
tmp.timestamp = bpf_ktime_get_ns();
bpf_usdt_readarg(1, ctx, &tmp.query);
u64 pid = bpf_get_current_pid_tgid();
temp.update(&pid, &tmp);
return 0;
}
int probe_query_end(struct pt_regs *ctx) {
struct temp_t *tempp;
u64 pid = bpf_get_current_pid_tgid();
tempp = temp.lookup(&pid);
if (!tempp)
return 0;
u64 delta = bpf_ktime_get_ns() - tempp->timestamp;
if (delta >=""" + str(threshold_ns) + """) {
struct data_t data = {};
data.pid = pid >> 32;
data.timestamp = tempp->timestamp;
data.duration = delta;
bpf_probe_read(&data.query, sizeof(data.query), tempp->query);
events.perf_submit(ctx, &data, sizeof(data));
}
temp.delete(&pid);
return 0;
};
第四步:使用enable_probe enable query__start和query__end
usdt = USDT(pid=int(dbpid))
usdt.enable_probe("query__start", "probe_query_start")
usdt.enable_probe("query__done", "probe_query_end")
第五步:定义Python数据结构b表示输出
class Data(ct.Structure):
_fields_ = [
("pid", ct.c_ulonglong),
("timestamp", ct.c_ulonglong),
("delta", ct.c_ulonglong),
("query", ct.c_char * 256)
]
第六步:输出
start = 0
def print_event(cpu, data, size):
global start
event = ct.cast(data, ct.POINTER(Data)).contents
if start == 0:
start = event.timestamp
print("%-14.6f %-6d %8.3f %s" % (float(event.timestamp - start) / 1000000000,
event.pid, float(event.delta) / 1000000, event.query))
print("Tracing database queries for PID %d slower than %d ms..." %
(dbpid, args.threshold))
print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
bpf["events"].open_perf_buffer(print_event)
效果:
root@ubuntu1604:~/bcc/tools# ./lqdbslower.py postgres 0
/virtual/main.c:45:15: warning: comparison of unsigned expression >= 0 is always true [-Wtautological-compare]
if (delta >=0) {
~~~~~ ^ ~
1 warning generated.
Tracing database queries for PID 50216 slower than 0 ms...
TIME(s) PID MS QUERY
0.000000 50216 1.806 INSERT INTO tbl(name, date) VALUES('aaa', '2013-12-22');
7.150496 50216 0.227 select * from tbl
原实验的dbslower.py
该实验编写一个基于BCC的观察Linux锁的竞争状态的工具。
首先编译,运行程序:
root@ubuntu1604:~/linux-tracing-workshop# gcc -g -fno-omit-frame-pointer -pthread parprimes.c -o parprimes
root@ubuntu1604:~/linux-tracing-workshop# ./parprimes 4 10000
在lockstat.py查找TODO完成该工具。
// TODO Update tm_key fields with the mutex, tid, and stack id
tm_key.tid = pid;
tm_key.mtx = entry->mtx;
tm_key.lock_stack_id = stack_id;
// TODO Call locks.lookup_or_init(…) and update the wait time and the enter count // of the entry in the locks data structure
struct thread_mutex_val_t *existing_tm_val, new_tm_val = {};
existing_tm_val = locks.lookup_or_init(&tm_key, &new_tm_val);
existing_tm_val->wait_time_ns += wait_time;
if (PT_REGS_RC(ctx) == 0) {
existing_tm_val->enter_count += 1;
}
// TODO Update the mutex_lock_hist histogram with the time we held the lock
u64 slot = bpf_log2l(hold_time / 1000);
mutex_lock_hist.increment(slot);
// TODO Similarly to the previous probe, attach the following probes: // uprobe in pthread_mutex_lock handled by probe_mutex_lock // uretprobe in pthread_mutex_lock handled by probe_mutex_lock_return // uprobe in pthread_mutex_unlock handled by probe_mutex_unlock
bpf.attach_uprobe(name="pthread", sym="pthread_mutex_lock", fn_name="probe_mutex_lock", pid=pid)
bpf.attach_uretprobe(name="pthread", sym="pthread_mutex_lock", fn_name="probe_mutex_lock_return", pid=pid)
bpf.attach_uprobe(name="pthread", sym="pthread_mutex_unlock", fn_name="probe_mutex_unlock", pid=pid)
// TODO Print a nicely formatted line with the mutex description, wait time, // hold time, enter count, and stack (use print_stack)
print("\tmutex %s ::: wait time %.2fus ::: hold time %.2fus ::: enter count %d" %
(mutex_descr, v.wait_time_ns/1000.0, v.lock_time_ns/1000.0, v.enter_count))
print_stack(bpf, pid, stacks, k.lock_stack_id)
效果:
root@ubuntu1604:~/linux-tracing-workshop# python lockstat.py $(pidof parprimes)
init stack for mutex 7fff3dfa1fa0 (#1)
[unknown] (7f2eebaa85a0)
[unknown] (7f2eeb6f5830)
[unknown] (113e258d4c544155)
thread 53243
mutex [unknown] ::: wait time 7.01us ::: hold time 5.56us ::: enter count 1
[unknown] (7f2eebcccb34)
[unknown] (7f2eeb70eff8)
[unknown] (7f2eeba9b060)
thread 53246
mutex #1 ::: wait time 1655.31us ::: hold time 809.63us ::: enter count 369
[unknown] (4009f0)
[unknown] (400a44)
[unknown] (7f2eebaa66ba)
thread 53247
mutex #1 ::: wait time 12850.63us ::: hold time 660.04us ::: enter count 302
[unknown] (4009f0)
[unknown] (400a44)
[unknown] (7f2eebaa66ba)
thread 53248
mutex #1 ::: wait time 13290.15us ::: hold time 610.43us ::: enter count 281
[unknown] (4009f0)
[unknown] (400a44)
[unknown] (7f2eebaa66ba)
thread 53249
mutex #1 ::: wait time 1282.58us ::: hold time 621.87us ::: enter count 279
[unknown] (4009f0)
[unknown] (400a44)
[unknown] (7f2eebaa66ba)
wait time (us) : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 1229 |****************************************|
8 -> 15 : 1 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
1024 -> 2047 : 0 | |
2048 -> 4095 : 0 | |
4096 -> 8191 : 0 | |
8192 -> 16383 : 2 | |
hold time (us) : count distribution
0 -> 1 : 0 | |
2 -> 3 : 1227 |****************************************|
4 -> 7 : 4 | |
8 -> 15 : 1 | |
原实验解答lockstat-solution