/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
struct cfs_rq cfs;
struct rt_rq rt;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
u64 clock_task;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
struct call_single_data hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
};
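The locking rule quoted at the top of struct rq matters whenever two runqueues must be held at once (load balancing, migration): always take the locks in ascending runqueue address order, so two CPUs can never deadlock against each other. A minimal sketch of such a double-lock helper, loosely modeled on the kernel's double_rq_lock() but written here only as an illustration (the real helper also carries lockdep nesting annotations):

/* Illustrative sketch only: acquire two runqueue locks in a deadlock-safe
 * order, following the "ascending &runqueue" rule from the comment above. */
static void double_rq_lock_sketch(struct rq *rq1, struct rq *rq2)
{
    if (rq1 == rq2) {
        /* same runqueue: one lock is enough */
        raw_spin_lock(&rq1->lock);
    } else if (rq1 < rq2) {
        /* the lower-addressed runqueue is always locked first */
        raw_spin_lock(&rq1->lock);
        raw_spin_lock(&rq2->lock);
    } else {
        raw_spin_lock(&rq2->lock);
        raw_spin_lock(&rq1->lock);
    }
}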
struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
};
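With CONFIG_FAIR_GROUP_SCHED, a task's sched_entity is just the leaf of a chain: parent points to the entity representing its task group on this CPU, cfs_rq is the queue the entity is queued on, and my_q is the queue a group entity owns for its children. Operations such as enqueueing walk that chain upward; the kernel expresses the walk with its for_each_sched_entity() macro, and the loop below is only a simplified sketch of the same idea:

/* Sketch (not kernel code): conceptually, enqueueing a task touches every
 * level of the group hierarchy, from the task's own entity up to the root. */
static void enqueue_hierarchy_sketch(struct sched_entity *se)
{
    for (; se; se = se->parent) {
        struct cfs_rq *cfs_rq = se->cfs_rq;  /* queue this level lives on */

        /* the real code would call enqueue_entity(cfs_rq, se, ...) here
         * and stop once it reaches a level that is already queued */
        (void)cfs_rq;
    }
}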
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
u64 exec_clock;
u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct list_head tasks;
struct list_head *balance_iterator;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last, *skip;
unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SMP
/*
* the part of load.weight contributed by tasks
*/
unsigned long task_weight;
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
/*
* Maintaining per-cpu shares distribution for group scheduling
*
* load_stamp is the last time we updated the load average
* load_last is the last time we updated the load average and saw load
* load_unacc_exec_time is currently unaccounted execution time
*/
u64 load_avg;
u64 load_period;
u64 load_stamp, load_last, load_unacc_exec_time;
unsigned long load_contribution;
#endif
#endif
};
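tasks_timeline is the red-black tree of runnable entities, keyed by vruntime, and rb_leftmost caches the tree's leftmost node so that the entity with the smallest vruntime (the one CFS wants to run next) can be found in O(1). A small illustrative re-implementation of that lookup, along the lines of the kernel's __pick_first_entity():

/* Illustrative sketch: return the runnable entity with the smallest
 * vruntime by using the cached leftmost rbtree node. */
static struct sched_entity *pick_leftmost_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = cfs_rq->rb_leftmost;

    if (!left)
        return NULL;  /* no runnable entities on this cfs_rq */

    return rb_entry(left, struct sched_entity, run_node);
}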
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
/*
 * Scheduling entity of the task currently running on this cfs_rq.
 */
u64 now = rq_of(cfs_rq)->clock_task;
/*
 * Current value of this runqueue's task clock.
 */
unsigned long delta_exec;
/*
 * No entity is currently running on this cfs_rq, so there is nothing
 * to account.
 */
if (unlikely(!curr))
return;
/*
 * Get the amount of time the current task was running
 * since the last time we changed load (this cannot
 * overflow on 32 bits):
 * i.e. the time elapsed since curr->exec_start was last refreshed.
 */
delta_exec = (unsigned long)(now - curr->exec_start);
/*
 * No time has elapsed, so there is nothing to update.
 */
if (!delta_exec)
return;
__update_curr(cfs_rq, curr, delta_exec);
/*
 * Restart the accounting window from now.
 */
curr->exec_start = now;
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
}
# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)       do { var = (val); } while (0)
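These wrappers only record statistics when CONFIG_SCHEDSTATS is enabled; when it is disabled they are expected to expand to nothing, so the accounting path pays no extra cost. Roughly, as a sketch of the disabled variants (the real definitions live in kernel/sched_stats.h):

/* Sketch of the !CONFIG_SCHEDSTATS variants: every statistics update
 * compiles away completely. */
# define schedstat_inc(rq, field)      do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val)       do { } while (0)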
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
cfs_rq->load_unacc_exec_time += delta_exec;
#endif
}
/*
 * This difference is the key used to order entities in the rbtree.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
}
/*
 * On min_vruntime vs. vruntime: vruntime is the amount of virtual runtime
 * an entity has already consumed.
 * 1. vruntime is kept up to date, and min_vruntime records the smallest
 *    value on the queue. Overflow is not really a concern: with a u64
 *    counted in nanoseconds, roughly 32 bits cover the sub-second part and
 *    the other 32 bits cover whole seconds, which is an extremely long span.
 * 2. So, ignoring vruntime overflow, min_vruntime simply stores the actual
 *    minimum vruntime of the queue, and update_min_vruntime() below only
 *    ever moves it forward.
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr)
vruntime = cfs_rq->curr->vruntime;
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
// the unsigned counter may eventually wrap around; max_vruntime() copes by comparing a signed delta
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 *
 * The key computed above is what orders the tree. The code compares the
 * signed result of the subtraction; with unsigned arithmetic a
 * "small - large" difference would wrap around to a huge positive number.
 */
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta > 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
// delta < 0 happens either because vruntime is simply smaller than min_vruntime,
// or because min_vruntime has already wrapped around while vruntime has not
// (making vruntime look hugely larger as an unsigned value). Either way,
// vruntime is the logically earlier value, so it becomes the new minimum.
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
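The same signed-difference trick appears in entity_key(), max_vruntime() and min_vruntime(): instead of comparing two u64 values directly, the code looks at the sign of their difference, which keeps working even after the counters wrap. A small self-contained user-space sketch (hypothetical values, not kernel code) showing why this matters:

#include <stdint.h>
#include <stdio.h>

/* Same idea as max_vruntime(): pick the logically later value by looking
 * at the sign of the (wrapped) unsigned difference. */
static uint64_t max_vruntime_demo(uint64_t min_vruntime, uint64_t vruntime)
{
    int64_t delta = (int64_t)(vruntime - min_vruntime);
    return delta > 0 ? vruntime : min_vruntime;
}

int main(void)
{
    /* Hypothetical values straddling a u64 wraparound: "old" is just below
     * the wrap point, "new" is logically 300 later but numerically tiny. */
    uint64_t old_v = UINT64_MAX - 100;
    uint64_t new_v = old_v + 300;  /* wraps around to 199 */

    /* a plain unsigned comparison would call old_v the larger value... */
    printf("unsigned compare picks:     %llu\n",
           (unsigned long long)(new_v > old_v ? new_v : old_v));
    /* ...while the signed-delta comparison correctly picks the later one */
    printf("signed-delta compare picks: %llu\n",
           (unsigned long long)max_vruntime_demo(old_v, new_v));
    return 0;
}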
/*
* delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
/*
 * If the entity's weight is exactly NICE_0_LOAD, no scaling is needed.
 * Otherwise the elapsed time must be scaled by NICE_0_LOAD / weight to
 * obtain the vruntime increment.
 */
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
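Concretely, calc_delta_fair() leaves the delta untouched for a nice-0 task and shrinks or stretches it for other weights. A small user-space sketch with weights taken from the kernel's prio_to_weight[] table (1024 for nice 0, 3121 for nice -5, 335 for nice 5) showing the effect of 1 ms of wall-clock runtime; the helper below is only an approximation of the fixed-point math in calc_delta_mine():

#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* Approximation of calc_delta_fair():
 * vruntime delta = delta * NICE_0_LOAD / weight. */
static uint64_t fair_delta(uint64_t delta_ns, unsigned long weight)
{
    return delta_ns * NICE_0_LOAD / weight;
}

int main(void)
{
    uint64_t delta = 1000000;  /* 1 ms of real execution time, in ns */

    printf("nice  0 (weight 1024): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 1024));  /* 1000000 */
    printf("nice -5 (weight 3121): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 3121));  /* ~328099 */
    printf("nice  5 (weight  335): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 335));   /* ~3056716 */
    return 0;
}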
/*
* Increase resolution of nice-level calculations:
*/
#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
// the load weight corresponding to a nice value of 0
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define LONG_MAX ((long)(~0UL>>1))
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32
struct load_weight {
/*
 * The actual load weight.
 */
unsigned long weight;
/*
 * Precomputed 2^32 / weight, so that dividing by the weight can be done
 * with a multiplication and a shift.
 */
unsigned long inv_weight;
};
/*
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
if (!lw->inv_weight) {
/*
 * On 64-bit kernels the weight itself may already be >= WMULT_CONST
 * (2^32); in that case an inverse of 1 is good enough.
 */
if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
lw->inv_weight = 1;
else
{
/*
 * inv_weight ~= WMULT_CONST / weight, with rounding (the weight/2 and
 * +1 terms). It is recomputed here lazily whenever the weight has
 * changed and inv_weight was reset to 0.
 */
lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
/ (lw->weight+1);
}
}
/*
 * Scale the elapsed time by the given weight; in the vruntime path
 * this computes delta_exec * NICE_0_LOAD.
 */
tmp = (u64)delta_exec * weight;
if (unlikely(tmp > WMULT_CONST))
{
/*
 * tmp already exceeds WMULT_CONST, i.e. delta_exec is very large, so the
 * shift is split into two halves to avoid overflowing 64 bits:
 * SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2)
 *   = ((delta_exec * NICE_0_LOAD >> 16) * (2^32 / lw->weight)) >> 16
 *   ~= delta_exec * NICE_0_LOAD / lw->weight
 */
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
}
else
{
/*
 * SRR(tmp * lw->inv_weight, WMULT_SHIFT)
 *   = (delta_exec * NICE_0_LOAD * (2^32 / lw->weight)) >> 32
 *   ~= delta_exec * NICE_0_LOAD / lw->weight
 */
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
}
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
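To see that the inverse-multiply really approximates the division, the fixed-point path can be reproduced in user space and compared against the exact quotient. This is a self-contained test sketch (not kernel code), assuming 64-bit arithmetic so WMULT_CONST is 2^32; the two results agree to within the small rounding error introduced by inv_weight:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32
/* shift right and round, as in the kernel's SRR() macro */
#define SRR(x, y)   (((x) + (1ULL << ((y) - 1))) >> (y))

/* User-space replica of calc_delta_mine() (the LONG_MAX clamp is omitted). */
static uint64_t delta_mine(uint64_t delta_exec, uint64_t weight, uint64_t lw_weight)
{
    uint64_t inv_weight = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);
    uint64_t tmp = delta_exec * weight;

    if (tmp > WMULT_CONST)
        return SRR(SRR(tmp, WMULT_SHIFT / 2) * inv_weight, WMULT_SHIFT / 2);
    return SRR(tmp * inv_weight, WMULT_SHIFT);
}

int main(void)
{
    uint64_t delta  = 4000000;  /* 4 ms, in ns */
    uint64_t weight = 1024;     /* NICE_0_LOAD */
    uint64_t lw     = 3121;     /* e.g. the weight of a nice -5 entity */

    printf("fixed-point: %llu\n", (unsigned long long)delta_mine(delta, weight, lw));
    printf("exact:       %llu\n", (unsigned long long)(delta * weight / lw));
    return 0;
}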
/*
 * 1024 == NICE_0_LOAD
 * runtime allocated to a task = scheduling period * task weight / total weight of all runnable tasks
 * vruntime gained = allocated runtime * 1024 / task weight
 * vruntime gained = (scheduling period * task weight / total weight) * 1024 / task weight
 *                 = scheduling period * 1024 / total weight
 * So over one scheduling period every runnable task accumulates the same
 * amount of vruntime, regardless of its weight.
 */
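A quick numeric check of that result, assuming a hypothetical 20 ms scheduling period and two runnable tasks, one at nice 0 (weight 1024) and one at nice 5 (weight 335):

/* Worked check of the identity above: both tasks gain the same vruntime. */
#include <stdio.h>

int main(void)
{
    double period = 20.0;               /* ms, assumed for illustration */
    double w0 = 1024.0, w5 = 335.0;
    double total = w0 + w5;             /* 1359 */

    double run0 = period * w0 / total;  /* ~15.07 ms of CPU time */
    double run5 = period * w5 / total;  /* ~ 4.93 ms of CPU time */

    /* both increments come out equal: period * 1024 / total */
    printf("nice 0: runtime %.2f ms, vruntime += %.2f ms\n", run0, run0 * 1024.0 / w0);
    printf("nice 5: runtime %.2f ms, vruntime += %.2f ms\n", run5, run5 * 1024.0 / w5);
    return 0;
}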
Source: oschina
Link: https://my.oschina.net/u/3695598/blog/4297435