Using eBPF to measure the CPU mode-switch overhead incurred by making a system call


Question


As the title says, but the measurement results look unreasonable. Let me describe the current status.

I'm using the getuid syscall as the measurement target. I started by measuring the complete overhead with two clock_gettime calls wrapped around the syscall, then measured the entry overhead (what the SYSCALL instruction does before the actual getuid code executes) and the leaving overhead separately (with an eBPF program hooked onto the entry and leaving points).

The result for the complete overhead is ~65ns, while the entry and leaving overheads are ~77ns and ~70ns respectively.
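
For reference, the complete-overhead number above comes from a tight loop along these lines (a minimal sketch of the idea, not the exact code I ran):

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    struct timespec t0, t1;
    long long delta = 0;
    const int n = 1000000;

    for (int i = 0; i < n; i++) {
        clock_gettime(CLOCK_MONOTONIC, &t0);
        syscall(SYS_getuid);                 /* the measured syscall */
        clock_gettime(CLOCK_MONOTONIC, &t1);
        delta += (t1.tv_sec - t0.tv_sec) * 1000000000LL
               + (t1.tv_nsec - t0.tv_nsec);
    }
    printf("avg complete overhead: %fns\n", (double)delta / n);
    return 0;
}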

Obviously my measurement carries some additional overhead beyond the typical one. But that is strange: clock_gettime is a vDSO call, so it should barely add noticeable overhead, and BPF, being a lightweight instrumentation mechanism (JIT-compiled, etc.) in today's Linux, shouldn't add noticeable overhead either.
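
As a sanity check on the vDSO claim, the per-call cost of clock_gettime itself can be estimated with a back-to-back loop, roughly like this sketch:

#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec start, end, tmp;
    const int n = 10000000;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < n; i++)
        clock_gettime(CLOCK_MONOTONIC, &tmp);   /* back-to-back vDSO calls */
    clock_gettime(CLOCK_MONOTONIC, &end);

    double total_ns = (end.tv_sec - start.tv_sec) * 1e9
                    + (end.tv_nsec - start.tv_nsec);
    printf("clock_gettime: ~%.1fns per call\n", total_ns / n);
    return 0;
}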

Does anyone have an idea what additional overhead my measurement incurs?

Following is my measurement code:

userland (measuring the return-from-kernel overhead):

#define _GNU_SOURCE
#include <bpf.h>
#include <libbpf.h>
#include <stdlib.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <asm/errno.h>
#include <linux/if_link.h>
#include <errno.h>

#include <sys/resource.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <time.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sched.h>

#define likely(x)   __builtin_expect((x),1) 
#define unlikely(x) __builtin_expect((x),0)
#define TEST_CNT 1000000
#define BPF_FILE_NAME "mkern.o"
#define BPF_MAP_NAME "msys"

static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                      int cpu, int group_fd,
                                      unsigned long flags)
{
    attr->size = sizeof(*attr);
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int attach_kprobe(int prog_fd)
{
    int err, fd, id;
    char buf[32];
    struct perf_event_attr attr = {};

    err = system("echo 'r:kp_sys_batch __x64_sys_getuid' > /sys/kernel/debug/tracing/kprobe_events");
    if (err < 0) {
        fprintf(stderr, "Failed to create kprobe, error '%s'\n", strerror(errno));
        return -1;
    }
    fd = open("/sys/kernel/debug/tracing/events/kprobes/kp_sys_batch/id", O_RDONLY, 0);
    if (fd < 0) {
        fprintf(stderr, "Failed to open event %s\n", "sys_batch");
        return -1;
    }
    err = read(fd, buf, sizeof(buf));
    if (err < 0 || err >= sizeof(buf)) {
        fprintf(stderr, "read from '%s' failed '%s'\n", "sys_batch", strerror(errno));
        return -1;
    }
    close(fd);

    buf[err] = 0;
    id = atoi(buf);
    attr.config = id;
    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_type = PERF_SAMPLE_RAW;
    attr.sample_period = 1;
    attr.wakeup_events = 1;

    fd = sys_perf_event_open(&attr, 0/*this process*/, -1/*any cpu*/, -1/*group leader*/, 0);
    if (fd < 0) {
        perror("sys_perf_event_open");
        fprintf(stderr, "Failed to open perf_event (id: %llu)\n", attr.config);
        return -1;
    }
    err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    if (err < 0) {
        fprintf(stderr, "ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
               strerror(errno));
        return -1;
    }
    err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
    if (err < 0) {
        fprintf(stderr, "ioctl PERF_EVENT_IOC_SET_BPF failed: %s\n",
               strerror(errno));
        return -1;
    }

    return 0;
}

static void maxi_memlock_rlimit(void)
{
    struct rlimit rlim_new = {
        .rlim_cur   = RLIM_INFINITY,
        .rlim_max   = RLIM_INFINITY,
    };

    if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
        fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
        exit(-1);
    }
}

static int find_map_fd(struct bpf_object *bpf_obj, const char *mapname)
{
    struct bpf_map *map;
    int map_fd = -1;

    map = bpf_object__find_map_by_name(bpf_obj, mapname);
    if (!map) {
        fprintf(stderr, "Failed finding map by name: %s\n", mapname);
        exit(-1);
    }

    map_fd = bpf_map__fd(map);

    return map_fd;
}

int main(int argc, char **argv)
{
    int bpf_map_fd;
    int bpf_prog_fd = -1;
    int err;
    int key = 0;
    struct timespec tp;
    struct bpf_object *bpf_obj;
    struct reals map;
    struct bpf_prog_load_attr xattr = {
        .prog_type = BPF_PROG_TYPE_KPROBE,
        .file = BPF_FILE_NAME,
    };

    maxi_memlock_rlimit();

    err = bpf_prog_load_xattr(&xattr, &bpf_obj, &bpf_prog_fd);
    if (err) {
        fprintf(stderr, "Failed loading bpf object file\n");
        exit(-1);
    }

    if (attach_kprobe(bpf_prog_fd)) {
        fprintf(stderr, "Failed attaching kprobe\n");
        exit(-1);
    }

    bpf_map_fd = find_map_fd(bpf_obj, BPF_MAP_NAME);
    if (bpf_map_fd < 0) {
        fprintf(stderr, "Failed finding map fd\n");
        exit(-1);
    }

    /* warm up */
    for (int i = 0; i < TEST_CNT; i++) {
        syscall(__NR_getuid); /* dummy call */
        clock_gettime(CLOCK_MONOTONIC, &tp);

        if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
            fprintf(stderr, "Failed to lookup map element\n");
            perror("lookup");
            exit(-1);
        }
    }

    uint64_t delta = 0;
    for (int i = 0; i < TEST_CNT; i++) {
        syscall(__NR_getuid); /* dummy call */
        clock_gettime(CLOCK_MONOTONIC, &tp);

        if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
            fprintf(stderr, "Failed to lookup map element\n");
            perror("lookup");
            exit(-1);
        }

        delta += (1000000000 * tp.tv_sec + tp.tv_nsec) - map.ts;
    }
    printf("avg: %fns\n", (double) delta / TEST_CNT);

    return 0;
}

userland (measuring the enter-kernel overhead; almost the same as the above, except for the lines shown here):

err = system("echo 'p:kp_sys_batch sys_batch' > /sys/kernel/debug/tracing/kprobe_events");
...
clock_gettime(CLOCK_MONOTONIC, &tp);
syscall(__NR_getuid); /* dummy call */
...
delta += map.ts - (1000000000 * tp.tv_sec + tp.tv_nsec);

kernel land:

SEC("getuid")
int kp_sys_batch(struct pt_regs *ctx)
{
    __u32 i = 0;
    struct reals *r;

    r = bpf_map_lookup_elem(&reals, &i);
    if (!r)
        return 1;

    r->ts = bpf_ktime_get_ns();

    return 0;
}
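
The program above uses a map named reals whose definition I didn't paste; a minimal declaration (assuming a single-entry array map in the legacy bpf_map_def style, which matches the bpf_prog_load_attr loader above) would look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>    /* SEC(), bpf_map_lookup_elem(), bpf_ktime_get_ns() */

struct reals {
    __u64 ts;                   /* timestamp written by the kprobe program */
};

struct bpf_map_def SEC("maps") reals = {
    .type        = BPF_MAP_TYPE_ARRAY,
    .key_size    = sizeof(__u32),
    .value_size  = sizeof(struct reals),
    .max_entries = 1,
};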

Besides the additional overhead mentioned above: in the return-from-kernel measurement code, if echo 'r:kp_sys_batch sys_batch' is changed to echo 'p:kp_sys_batch sys_batch' (which means the measurement now also includes the execution of the syscall itself), the result becomes ~48ns. That result should cover both the syscall execution and the return from kernel, so how can it be only ~48ns?

Thanks!

Source: https://stackoverflow.com/questions/65753521/using-ebpf-to-measure-cpu-mode-switch-overhead-incured-by-making-system-call
