Every clock tick raises a timer interrupt --------> timer_interrupt --------> do_timer_interrupt_hook, which calls do_timer() and update_process_times();
update_process_times() finally calls scheduler_tick(void);
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	/*
	 * Update the runqueue clocks, rq->clock and rq->clock_task; note
	 * that the actual time accounting later on uses clock_task.
	 */
	update_rq_clock(rq);
	update_cpu_load_active(rq);
	/* For a CFS task this calls back into task_tick_fair(). */
	curr->sched_class->task_tick(rq, curr, 0);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
The callback is task_tick_fair() --------> which finally calls entity_tick(cfs_rq, se, queued);
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 * update_curr() refreshes the runtime accounting of the runnable
	 * entities; the scheduler uses these numbers to decide which task
	 * to run next.
	 */
	update_curr(cfs_rq);

	/*
	 * Update share accounting for long-running entities.
	 */
	update_entity_shares_tick(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	/* The ideal (wall-clock) runtime of curr for this period. */
	ideal_runtime = sched_slice(cfs_rq, curr);
	/* How long curr has actually run since it was picked. */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		/* curr has used up its slice: set its TIF_NEED_RESCHED flag. */
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Second preemption condition: compare curr's vruntime with the
	 * smallest vruntime in the tree; if curr is ahead by more than
	 * ideal_runtime, reschedule.  (Why compare a vruntime difference
	 * against the wall-clock value produced by sched_slice(), and what
	 * does exceeding ideal_runtime mean?  See the worked example below.)
	 */
	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
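To make the two conditions concrete, here is a small stand-alone sketch (the helper name and plain integer parameters are hypothetical, not kernel code). With three runnable tasks of equal weight and a 6 ms period, sched_slice() gives an ideal_runtime of 2 ms; for equal weights vruntime advances at wall-clock speed, so curr gets the resched flag either after running 2 ms, or once its vruntime has moved more than 2 ms ahead of the leftmost entity in the tree.

```c
#include <stdint.h>

/* Hypothetical stand-alone version of the two checks; all times in nanoseconds. */
static int should_resched_on_tick(uint64_t ideal_runtime, uint64_t delta_exec,
				  int64_t vruntime_lead, uint64_t min_granularity)
{
	if (delta_exec > ideal_runtime)		/* condition 1: wall-clock slice used up    */
		return 1;
	if (delta_exec < min_granularity)	/* ran too briefly, skip the second check   */
		return 0;
	if (vruntime_lead < 0)			/* curr is already behind the leftmost task */
		return 0;
	return vruntime_lead > (int64_t)ideal_runtime;	/* condition 2: too far ahead in vruntime */
}
```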
- sched_slice() computes a task's time baseline (wall time); a task's theoretical runtime depends on the number of tasks in the whole cfs_rq and on their weights.
- __sched_period() returns the scheduling period, i.e. the total time in which every runnable task gets to run once (a sketch of this function follows below). The formula is:

  p = (nr <= nl) ? l : mg * nr

  - l: a system constant, the scheduling latency (sysctl_sched_latency), i.e. the time in which all runnable tasks should run once
  - nl: a system constant, the upper bound on the number of runnable tasks that latency can cover (sched_nr_latency)
  - nr: the current number of runnable tasks
  - mg: a system constant, the minimum scheduling granularity (sysctl_sched_min_granularity)
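As a sketch of that formula (a simplified reconstruction rather than the kernel source, assuming the common defaults of a 6 ms latency, a 0.75 ms minimum granularity and sched_nr_latency = 8):

```c
#include <stdio.h>

/* Simplified reconstruction of __sched_period() under the assumed defaults. */
static unsigned long long sched_period_ns(unsigned long nr_running)
{
	unsigned long long period = 6000000ULL;	/* sysctl_sched_latency: 6 ms        */
	unsigned long nr_latency = 8;		/* sched_nr_latency                  */

	if (nr_running > nr_latency)
		period = 750000ULL * nr_running; /* sysctl_sched_min_granularity * nr */

	return period;
}

int main(void)
{
	printf("3 tasks : %llu ns\n", sched_period_ns(3));	/* 6 ms  */
	printf("16 tasks: %llu ns\n", sched_period_ns(16));	/* 12 ms */
	return 0;
}
```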
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		/*
		 * Here (and in the period calculation above) the case of a
		 * freshly created task is handled: it is not yet on the
		 * runqueue, so its weight is added temporarily to keep the
		 * calculation correct.
		 */
		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
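For example (a small user-space sketch, not kernel code; 1024 and 335 are the standard nice-0 and nice-5 load weights): with a 6 ms period, the nice-0 task receives about 4.52 ms of wall time and the nice-5 task about 1.48 ms.

```c
#include <stdio.h>

/* s = p * w / rw, ignoring the fixed-point tricks of calc_delta_mine(). */
static unsigned long long slice_ns(unsigned long long period_ns,
				   unsigned long w, unsigned long rw)
{
	return period_ns * w / rw;
}

int main(void)
{
	unsigned long long period = 6000000ULL;		/* 6 ms                  */
	unsigned long rw = 1024 + 335;			/* total runqueue weight */

	printf("nice 0: %llu ns\n", slice_ns(period, 1024, rw));	/* ~4.52 ms */
	printf("nice 5: %llu ns\n", slice_ns(period,  335, rw));	/* ~1.48 ms */
	return 0;
}
```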
During process fork, sched_fork() is called from copy_process():
/*
 * fork()/clone()-time setup:
 */
void sched_fork(struct task_struct *p)
{
	unsigned long flags;
	int cpu = get_cpu();

	/* Initialize the scheduler-related fields of the task_struct,
	 * in particular its struct sched_entity. */
	__sched_fork(p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 * (A temporary priority boost must not be inherited by the child.)
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		/* sched_reset_on_fork is set: fall back to the default policy. */
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		/* Recompute the load weight from priority and policy. */
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	/* Not a real-time task: use the CFS scheduling class. */
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	/* For CFS this calls back into task_fork_fair(). */
	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child due to cgroup_fork()
	 * is ran before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}
The fair_sched_class callbacks referred to above, task_tick_fair() and task_fork_fair():
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}
}

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	/* Refresh the runqueue clock. */
	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	if (unlikely(task_cpu(p) != this_cpu)) {
		rcu_read_lock();
		__set_task_cpu(p, this_cpu);
		rcu_read_unlock();
	}

	/* Update the runtime statistics of the cfs_rq. */
	update_curr(cfs_rq);

	/* The child's virtual time starts from the parent's vruntime... */
	if (curr)
		se->vruntime = curr->vruntime;
	/* ...and is then adjusted by place_entity(). */
	place_entity(cfs_rq, se, 1);

	/*
	 * sysctl_sched_child_runs_first forces the child to run before the
	 * parent.  entity_before(curr, se) checks whether the vruntimes have
	 * to be swapped for that; the swap is only needed when the parent's
	 * vruntime is smaller than the child's.
	 */
	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	/*
	 * Store the vruntime relative to min_vruntime; see the note and
	 * sketch below for why.
	 */
	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
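Why the `se->vruntime -= cfs_rq->min_vruntime` at the end? The child may be enqueued on a different CPU than the one it was forked on, so its vruntime is stored relative to the fork CPU's min_vruntime and rebased against the target runqueue's min_vruntime when it is enqueued. A rough summary of the child's vruntime bookkeeping (a hypothetical helper that ignores the optional child-runs-first vruntime swap):

```c
#include <stdint.h>

/* Hypothetical summary of the child's vruntime from fork to first enqueue. */
static uint64_t child_vruntime_sketch(uint64_t parent_vruntime, uint64_t vslice,
				      uint64_t min_vruntime_fork_cpu,
				      uint64_t min_vruntime_wake_cpu)
{
	/* place_entity(cfs_rq, se, 1): one vslice after min_vruntime,
	 * but never behind the parent's vruntime. */
	uint64_t v = min_vruntime_fork_cpu + vslice;
	if (parent_vruntime > v)
		v = parent_vruntime;

	v -= min_vruntime_fork_cpu;	/* end of task_fork_fair(): make relative */
	v += min_vruntime_wake_cpu;	/* enqueue on the target CPU: rebase      */

	return v;
}
```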
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	/* The baseline is min_vruntime. */
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	/* A new task therefore starts slightly behind min_vruntime. */
	if (initial)
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		/*
		 * This is the wakeup path for a sleeping task.  Because it
		 * slept, se->vruntime is certainly very small, but if it were
		 * used as-is the task would grab running time from everyone
		 * else.  So cfs_rq->min_vruntime - thresh is used instead:
		 * the woken task keeps its scheduling preference without
		 * being so far behind that it hurts the other tasks.
		 */
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	/* (vruntime can only ever be adjusted forwards here.) */
	vruntime = max_vruntime(se->vruntime, vruntime);
	se->vruntime = vruntime;
}
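A minimal numeric sketch of the two placements (hypothetical helper, assuming GENTLE_FAIR_SLEEPERS and the default 6 ms sysctl_sched_latency): a newly forked task lands one vslice after min_vruntime, and a woken sleeper at most 3 ms before it.

```c
#include <stdint.h>

/* Hypothetical stand-alone version of place_entity()'s arithmetic (times in ns). */
static uint64_t place_vruntime_sketch(uint64_t min_vruntime, uint64_t old_vruntime,
				      uint64_t vslice, int initial)
{
	uint64_t vruntime = min_vruntime;

	if (initial)
		vruntime += vslice;		/* new task: pushed back by one vslice      */
	else
		vruntime -= 6000000ULL >> 1;	/* woken sleeper: half-latency credit, 3 ms */

	/* never gain time by being placed backwards */
	return old_vruntime > vruntime ? old_vruntime : vruntime;
}
```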
In fork() ---> do_fork():
-------------------> p = copy_process
-------------------> if (!IS_ERR(p)) -------------> wake_up_new_task(p) is called;
#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)

Note how a negative errno is turned into a large unsigned value: the check tests whether the value lies in the range (0xfffff000, 0xffffffff] on a 32-bit machine, a region that is never a valid kernel pointer, so IS_ERR() can be used to decide whether a kernel function returned a valid pointer or an encoded error. Also note the intent behind the unlikely() hint here.
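A small user-space demonstration of the same convention (ERR_PTR/PTR_ERR/IS_ERR are simplified reconstructions of the kernel macros, without their annotations):

```c
#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)  { return IS_ERR_VALUE((unsigned long)ptr); }

int main(void)
{
	int x = 0;
	void *ok  = &x;			/* a genuine pointer            */
	void *bad = ERR_PTR(-12);	/* -ENOMEM encoded as a pointer */

	printf("ok : IS_ERR=%d\n", IS_ERR(ok));				/* 0      */
	printf("bad: IS_ERR=%d err=%ld\n", IS_ERR(bad), PTR_ERR(bad));	/* 1, -12 */
	return 0;
}
```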
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	/* Can the new task preempt the currently running one? */
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}
activate_task() eventually reaches the CFS function enqueue_task_fair(), which was analyzed earlier.
check_preempt_curr() decides whether the currently running task can be preempted by task p:
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}
The CFS callback for check_preempt_curr is check_preempt_wakeup():
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;

	/* The task being woken is the one already running: nothing to do. */
	if (unlikely(se == pse))
		return;

	/*
	 * This is possible from callers such as move_task(), in which we
	 * unconditionally check_prempt_curr() after an enqueue (which may have
	 * lead to a throttle).  This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;

	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set.  This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	/* The resched flag is already set: no need to go any further. */
	if (test_tsk_need_resched(curr))
		return;

	/* Idle tasks are by definition preempted by non-idle tasks. */
	if (unlikely(curr->policy == SCHED_IDLE) &&
	    likely(p->policy != SCHED_IDLE))
		goto preempt;

	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
	/* Not a SCHED_NORMAL (CFS) task: return. */
	if (unlikely(p->policy != SCHED_NORMAL))
		return;

	find_matching_se(&se, &pse);
	update_curr(cfs_rq_of(se));
	BUG_ON(!pse);
	/* The core check deciding whether preemption is needed. */
	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}

	return;

preempt:
	resched_task(curr);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can * drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons its a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}
/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	/* Difference between the two virtual runtimes. */
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	/* se's vruntime is not smaller than curr's: curr should keep
	 * running, no preemption needed. */
	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(curr, se);
	/*
	 * gran is the minimum lead required for preemption: only when the
	 * difference exceeds it is preemption signalled, which avoids
	 * overly frequent preemption.
	 */
	if (vdiff > gran)
		return 1;

	return 0;
}
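A rough stand-alone version of that decision (hypothetical helper; wakeup_gran() roughly converts sysctl_sched_wakeup_granularity, 1 ms by default, into the waking entity's virtual time, so vdiff and vgran below are in the same units):

```c
#include <stdint.h>

/* Hypothetical stand-alone version of wakeup_preempt_entity()'s return values. */
static int wakeup_preempt_sketch(uint64_t curr_vruntime, uint64_t se_vruntime,
				 uint64_t vgran)
{
	int64_t vdiff = (int64_t)(curr_vruntime - se_vruntime);

	if (vdiff <= 0)
		return -1;	/* the waking task is not behind curr: don't preempt */
	if (vdiff > (int64_t)vgran)
		return 1;	/* far enough behind: preempt curr                   */
	return 0;		/* within the granularity: leave curr alone          */
}
```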
Waking up a process: try_to_wake_up():
/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	smp_wmb();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	/* p is still on a runqueue: a "remote" (lightweight) wakeup is enough. */
	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
		/*
		 * In case the architecture enables interrupts in
		 * context_switch(), we cannot busy wait, since that
		 * would lead to deadlocks when an interrupt hits and
		 * tries to wake up @prev. So bail and do a complete
		 * remote wakeup.
		 */
		if (ttwu_activate_remote(p, wake_flags))
			goto stat;
#else
		cpu_relax();
#endif
	}
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}
/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (p->on_rq) {
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}
/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	trace_sched_wakeup(p, true);
	check_preempt_curr(rq, p, wake_flags);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	if (rq->idle_stamp) {
		u64 delta = rq->clock - rq->idle_stamp;
		u64 max = 2*sysctl_sched_migration_cost;

		if (delta > max)
			rq->avg_idle = max;
		else
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
}
As for ttwu_queue():
static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}