/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
struct cfs_rq cfs;
struct rt_rq rt;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
u64 clock_task;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
struct call_single_data hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
};
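The locking rule quoted at the top of struct rq matters whenever two runqueues must be held at once (load balancing, migration): always take the locks in ascending runqueue address order, so two CPUs can never deadlock against each other. A minimal sketch of such a double-lock helper, loosely modeled on the kernel's double_rq_lock() but written here only as an illustration (the real helper also carries lockdep nesting annotations):

/* Illustrative sketch only: acquire two runqueue locks in a deadlock-safe
 * order, following the "ascending &runqueue" rule from the comment above. */
static void double_rq_lock_sketch(struct rq *rq1, struct rq *rq2)
{
    if (rq1 == rq2) {
        /* same runqueue: one lock is enough */
        raw_spin_lock(&rq1->lock);
    } else if (rq1 < rq2) {
        /* the lower-addressed runqueue is always locked first */
        raw_spin_lock(&rq1->lock);
        raw_spin_lock(&rq2->lock);
    } else {
        raw_spin_lock(&rq2->lock);
        raw_spin_lock(&rq1->lock);
    }
}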
struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
};
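With CONFIG_FAIR_GROUP_SCHED, a task's sched_entity is just the leaf of a chain: parent points to the entity representing its task group on this CPU, cfs_rq is the queue the entity is queued on, and my_q is the queue a group entity owns for its children. Operations such as enqueueing walk that chain upward; the kernel expresses the walk with its for_each_sched_entity() macro, and the loop below is only a simplified sketch of the same idea:

/* Sketch (not kernel code): conceptually, enqueueing a task touches every
 * level of the group hierarchy, from the task's own entity up to the root. */
static void enqueue_hierarchy_sketch(struct sched_entity *se)
{
    for (; se; se = se->parent) {
        struct cfs_rq *cfs_rq = se->cfs_rq;  /* queue this level lives on */

        /* the real code would call enqueue_entity(cfs_rq, se, ...) here
         * and stop once it reaches a level that is already queued */
        (void)cfs_rq;
    }
}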
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
u64 exec_clock;
u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct list_head tasks;
struct list_head *balance_iterator;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last, *skip;
unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SMP
/*
* the part of load.weight contributed by tasks
*/
unsigned long task_weight;
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
/*
* Maintaining per-cpu shares distribution for group scheduling
*
* load_stamp is the last time we updated the load average
* load_last is the last time we updated the load average and saw load
* load_unacc_exec_time is currently unaccounted execution time
*/
u64 load_avg;
u64 load_period;
u64 load_stamp, load_last, load_unacc_exec_time;
unsigned long load_contribution;
#endif
#endif
};
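tasks_timeline is the red-black tree of runnable entities, keyed by vruntime, and rb_leftmost caches the tree's leftmost node so that the entity with the smallest vruntime (the one CFS wants to run next) can be found in O(1). A small illustrative re-implementation of that lookup, along the lines of the kernel's __pick_first_entity():

/* Illustrative sketch: return the runnable entity with the smallest
 * vruntime by using the cached leftmost rbtree node. */
static struct sched_entity *pick_leftmost_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = cfs_rq->rb_leftmost;

    if (!left)
        return NULL;  /* no runnable entities on this cfs_rq */

    return rb_entry(left, struct sched_entity, run_node);
}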
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
/*
 * Scheduling entity of the task currently running on this cfs_rq.
 */
u64 now = rq_of(cfs_rq)->clock_task;
/*
 * Current value of this runqueue's task clock.
 */
unsigned long delta_exec;
/*
 * No entity is currently running on this cfs_rq, so there is nothing
 * to account.
 */
if (unlikely(!curr))
return;
/*
 * Get the amount of time the current task was running
 * since the last time we changed load (this cannot
 * overflow on 32 bits):
 * i.e. the time elapsed since curr->exec_start was last refreshed.
 */
delta_exec = (unsigned long)(now - curr->exec_start);
/*
 * No time has elapsed, so there is nothing to update.
 */
if (!delta_exec)
return;
__update_curr(cfs_rq, curr, delta_exec);
/*
 * Restart the accounting window from now.
 */
curr->exec_start = now;
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
}
# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)       do { var = (val); } while (0)
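These wrappers only record statistics when CONFIG_SCHEDSTATS is enabled; when it is disabled they are expected to expand to nothing, so the accounting path pays no extra cost. Roughly, as a sketch of the disabled variants (the real definitions live in kernel/sched_stats.h):

/* Sketch of the !CONFIG_SCHEDSTATS variants: every statistics update
 * compiles away completely. */
# define schedstat_inc(rq, field)      do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val)       do { } while (0)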
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
cfs_rq->load_unacc_exec_time += delta_exec;
#endif
}
/*
 * This difference is the key used to order entities in the rbtree.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
}
/*
 * On min_vruntime vs. vruntime: vruntime is the amount of virtual runtime
 * an entity has already consumed.
 * 1. vruntime is kept up to date, and min_vruntime records the smallest
 *    value on the queue. Overflow is not really a concern: with a u64
 *    counted in nanoseconds, roughly 32 bits cover the sub-second part and
 *    the other 32 bits cover whole seconds, which is an extremely long span.
 * 2. So, ignoring vruntime overflow, min_vruntime simply stores the actual
 *    minimum vruntime of the queue, and update_min_vruntime() below only
 *    ever moves it forward.
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr)
vruntime = cfs_rq->curr->vruntime;
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
// the unsigned counter may eventually wrap around; max_vruntime() copes by comparing a signed delta
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 *
 * The key computed above is what orders the tree. The code compares the
 * signed result of the subtraction; with unsigned arithmetic a
 * "small - large" difference would wrap around to a huge positive number.
 */
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta > 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
// delta < 0 happens either because vruntime is simply smaller than min_vruntime,
// or because min_vruntime has already wrapped around while vruntime has not
// (making vruntime look hugely larger as an unsigned value). Either way,
// vruntime is the logically earlier value, so it becomes the new minimum.
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
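The same signed-difference trick appears in entity_key(), max_vruntime() and min_vruntime(): instead of comparing two u64 values directly, the code looks at the sign of their difference, which keeps working even after the counters wrap. A small self-contained user-space sketch (hypothetical values, not kernel code) showing why this matters:

#include <stdint.h>
#include <stdio.h>

/* Same idea as max_vruntime(): pick the logically later value by looking
 * at the sign of the (wrapped) unsigned difference. */
static uint64_t max_vruntime_demo(uint64_t min_vruntime, uint64_t vruntime)
{
    int64_t delta = (int64_t)(vruntime - min_vruntime);
    return delta > 0 ? vruntime : min_vruntime;
}

int main(void)
{
    /* Hypothetical values straddling a u64 wraparound: "old" is just below
     * the wrap point, "new" is logically 300 later but numerically tiny. */
    uint64_t old_v = UINT64_MAX - 100;
    uint64_t new_v = old_v + 300;  /* wraps around to 199 */

    /* a plain unsigned comparison would call old_v the larger value... */
    printf("unsigned compare picks:     %llu\n",
           (unsigned long long)(new_v > old_v ? new_v : old_v));
    /* ...while the signed-delta comparison correctly picks the later one */
    printf("signed-delta compare picks: %llu\n",
           (unsigned long long)max_vruntime_demo(old_v, new_v));
    return 0;
}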
/*
* delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
/*
 * If the entity's weight is exactly NICE_0_LOAD, no scaling is needed.
 * Otherwise the elapsed time must be scaled by NICE_0_LOAD / weight to
 * obtain the vruntime increment.
 */
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
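Concretely, calc_delta_fair() leaves the delta untouched for a nice-0 task and shrinks or stretches it for other weights. A small user-space sketch with weights taken from the kernel's prio_to_weight[] table (1024 for nice 0, 3121 for nice -5, 335 for nice 5) showing the effect of 1 ms of wall-clock runtime; the helper below is only an approximation of the fixed-point math in calc_delta_mine():

#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* Approximation of calc_delta_fair():
 * vruntime delta = delta * NICE_0_LOAD / weight. */
static uint64_t fair_delta(uint64_t delta_ns, unsigned long weight)
{
    return delta_ns * NICE_0_LOAD / weight;
}

int main(void)
{
    uint64_t delta = 1000000;  /* 1 ms of real execution time, in ns */

    printf("nice  0 (weight 1024): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 1024));  /* 1000000 */
    printf("nice -5 (weight 3121): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 3121));  /* ~328099 */
    printf("nice  5 (weight  335): vruntime += %llu ns\n",
           (unsigned long long)fair_delta(delta, 335));   /* ~3056716 */
    return 0;
}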
/*
* Increase resolution of nice-level calculations:
*/
#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
// the load weight corresponding to a nice value of 0
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define LONG_MAX ((long)(~0UL>>1))
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32
struct load_weight {
/*
 * The actual load weight.
 */
unsigned long weight;
/*
 * Precomputed 2^32 / weight, so that dividing by the weight can be done
 * with a multiplication and a shift.
 */
unsigned long inv_weight;
};
/*
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
if (!lw->inv_weight) {
/*
 * On 64-bit kernels the weight itself may already be >= WMULT_CONST
 * (2^32); in that case an inverse of 1 is good enough.
 */
if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
lw->inv_weight = 1;
else
{
/*
 * inv_weight ~= WMULT_CONST / weight, with rounding (the weight/2 and
 * +1 terms). It is recomputed here lazily whenever the weight has
 * changed and inv_weight was reset to 0.
 */
lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
/ (lw->weight+1);
}
}
/*
 * Scale the elapsed time by the given weight; in the vruntime path
 * this computes delta_exec * NICE_0_LOAD.
 */
tmp = (u64)delta_exec * weight;
if (unlikely(tmp > WMULT_CONST))
{
/*
 * tmp already exceeds WMULT_CONST, i.e. delta_exec is very large, so the
 * shift is split into two halves to avoid overflowing 64 bits:
 * SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2)
 *   = ((delta_exec * NICE_0_LOAD >> 16) * (2^32 / lw->weight)) >> 16
 *   ~= delta_exec * NICE_0_LOAD / lw->weight
 */
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
}
else
{
/*
 * SRR(tmp * lw->inv_weight, WMULT_SHIFT)
 *   = (delta_exec * NICE_0_LOAD * (2^32 / lw->weight)) >> 32
 *   ~= delta_exec * NICE_0_LOAD / lw->weight
 */
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
}
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
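To see that the inverse-multiply really approximates the division, the fixed-point path can be reproduced in user space and compared against the exact quotient. This is a self-contained test sketch (not kernel code), assuming 64-bit arithmetic so WMULT_CONST is 2^32; the two results agree to within the small rounding error introduced by inv_weight:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32
/* shift right and round, as in the kernel's SRR() macro */
#define SRR(x, y)   (((x) + (1ULL << ((y) - 1))) >> (y))

/* User-space replica of calc_delta_mine() (the LONG_MAX clamp is omitted). */
static uint64_t delta_mine(uint64_t delta_exec, uint64_t weight, uint64_t lw_weight)
{
    uint64_t inv_weight = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);
    uint64_t tmp = delta_exec * weight;

    if (tmp > WMULT_CONST)
        return SRR(SRR(tmp, WMULT_SHIFT / 2) * inv_weight, WMULT_SHIFT / 2);
    return SRR(tmp * inv_weight, WMULT_SHIFT);
}

int main(void)
{
    uint64_t delta  = 4000000;  /* 4 ms, in ns */
    uint64_t weight = 1024;     /* NICE_0_LOAD */
    uint64_t lw     = 3121;     /* e.g. the weight of a nice -5 entity */

    printf("fixed-point: %llu\n", (unsigned long long)delta_mine(delta, weight, lw));
    printf("exact:       %llu\n", (unsigned long long)(delta * weight / lw));
    return 0;
}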
/*
 * 1024 == NICE_0_LOAD
 * runtime allocated to a task = scheduling period * task weight / total weight of all runnable tasks
 * vruntime gained = allocated runtime * 1024 / task weight
 * vruntime gained = (scheduling period * task weight / total weight) * 1024 / task weight
 *                 = scheduling period * 1024 / total weight
 * So over one scheduling period every runnable task accumulates the same
 * amount of vruntime, regardless of its weight.
 */
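A quick numeric check of that result, assuming a hypothetical 20 ms scheduling period and two runnable tasks, one at nice 0 (weight 1024) and one at nice 5 (weight 335):

/* Worked check of the identity above: both tasks gain the same vruntime. */
#include <stdio.h>

int main(void)
{
    double period = 20.0;               /* ms, assumed for illustration */
    double w0 = 1024.0, w5 = 335.0;
    double total = w0 + w5;             /* 1359 */

    double run0 = period * w0 / total;  /* ~15.07 ms of CPU time */
    double run5 = period * w5 / total;  /* ~ 4.93 ms of CPU time */

    /* both increments come out equal: period * 1024 / total */
    printf("nice 0: runtime %.2f ms, vruntime += %.2f ms\n", run0, run0 * 1024.0 / w0);
    printf("nice 5: runtime %.2f ms, vruntime += %.2f ms\n", run5, run5 * 1024.0 / w5);
    return 0;
}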
Source: oschina
Link: https://my.oschina.net/u/3695598/blog/4297435