static inline int __ffs(int x)
{
        int r = 0;

        if (!x)
                return 0;
        if (!(x & 0xffff)) {
                x >>= 16;
                r += 16;
        }
        if (!(x & 0xff)) {
                x >>= 8;
                r += 8;
        }
        if (!(x & 0xf)) {
                x >>= 4;
                r += 4;
        }
        if (!(x & 3)) {
                x >>= 2;
                r += 2;
        }
        if (!(x & 1)) {
                x >>= 1;
                r += 1;
        }
        return r;
}
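Each test halves the window still being searched, so __ffs() returns the index of the least significant set bit after at most five branches, regardless of the value. A quick standalone check (the main() below is only an illustration, not kernel code; it assumes the __ffs() definition above is pasted into the same file):

#include <stdio.h>

int main(void)
{
        /* 0x40 = 0100 0000b: the lowest set bit is bit 6 */
        printf("__ffs(0x40) = %d\n", __ffs(0x40));      /* prints 6 */
        /* 0x06 = 0110b: the lowest set bit is bit 1 */
        printf("__ffs(0x06) = %d\n", __ffs(0x6));       /* prints 1 */
        return 0;
}

The scheduler leans on exactly this kind of constant-time bit scan (sched_find_first_bit() over the priority bitmap) to pick the next task, as the schedule() code below shows.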
struct runqueue {
        spinlock_t lock;

This is a spinlock; nginx uses the same idea when it deals with the thundering-herd problem. The difference from an ordinary (sleeping) lock is this: with an ordinary lock, you try to take the lock, find that someone else already holds it, and go to sleep until the holder is done and wakes you up. So if one task holds the lock, a hundred tasks may be asleep at the door waiting; when the holder returns the lock, all hundred waiters are woken up, they scramble for it, one wins and goes in, and the rest go back to waiting. A spinlock is different: when it finds the lock already taken, it does not sleep at all; it keeps busy-waiting, repeatedly checking on its own whether the lock has been returned. Can you see which is better in which situation?

        /*
         * nr_running and cpu_load should be in the same cacheline because
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
#ifdef CONFIG_SMP
        unsigned long cpu_load;
#endif
        unsigned long long nr_switches;

        /*
         * This is part of a global counter where only the total sum
         * over all CPUs matters. A task can increase this counter on
         * one CPU and if it got migrated afterwards it may decrease
         * it on another CPU. Always updated under the runqueue lock:
         */
        unsigned long nr_uninterruptible;

        unsigned long expired_timestamp;
        unsigned long long timestamp_last_tick;
        task_t *curr, *idle;
        struct mm_struct *prev_mm;
        prio_array_t *active, *expired, arrays[2];

Here are the priority arrays we spent so long on above. But why does a runqueue hold more than one of them? That is explained below.

        int best_expired_prio;
        atomic_t nr_iowait;
        ...
};
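To make the spin-versus-sleep distinction concrete, here is a minimal user-space sketch of the busy-waiting idea: a toy test-and-set lock built on GCC atomic builtins. It only models the behaviour; the kernel's spinlock_t is a separate, architecture-specific implementation, and the toy_* names are mine.

#include <pthread.h>
#include <stdio.h>

/* Toy test-and-set spinlock; only illustrates "keep checking instead
 * of sleeping", it is not how the kernel implements spinlock_t. */
typedef struct { volatile int locked; } toy_spinlock_t;

static void toy_spin_lock(toy_spinlock_t *l)
{
        /* A sleeping lock would block here and wait to be woken up;
         * a spinlock keeps re-testing until the holder lets go. */
        while (__sync_lock_test_and_set(&l->locked, 1))
                while (l->locked)
                        ;       /* busy-wait */
}

static void toy_spin_unlock(toy_spinlock_t *l)
{
        __sync_lock_release(&l->locked);
}

static toy_spinlock_t lock;
static long counter;

static void *worker(void *unused)
{
        (void)unused;
        for (int i = 0; i < 100000; i++) {
                toy_spin_lock(&lock);
                counter++;              /* tiny critical section: spinning is cheap */
                toy_spin_unlock(&lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("counter = %ld\n", counter);     /* 400000 if the lock works */
        return 0;
}

(Build with gcc -pthread.) Spinning burns CPU while it waits but avoids the cost of going to sleep and being woken up, so it only wins when the critical section is very short; that is exactly the case for the runqueue lock, which is held just long enough to touch a few list heads and counters.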
asmlinkage void __sched schedule(void)
{
        long *switch_count;
        task_t *prev, *next;
        runqueue_t *rq;
        prio_array_t *array;
        struct list_head *queue;
        unsigned long long now;
        unsigned long run_time;
        int cpu, idx;

First, look at the state of the currently running process:

        /*
         * Test if we are atomic. Since do_exit() needs to call into
         * schedule() atomically, we ignore that path for now.
         * Otherwise, whine if we are scheduling when we should not be.
         */
        if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
                if (unlikely(in_atomic())) {
                        printk(KERN_ERR "scheduling while atomic: "
                                "%s/0x%08x/%d\n",
                                current->comm, preempt_count(), current->pid);
                        dump_stack();
                }
        }
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
        preempt_disable();
        prev = current;
        release_kernel_lock(prev);
need_resched_nonpreemptible:
        rq = this_rq();

This line finds the runqueue belonging to this CPU. To stress it once more: every CPU has a runqueue of its own.

        /*
         * The idle thread is not allowed to schedule!
         * Remove this check after it has been exercised a bit.
         */
        if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
                printk(KERN_ERR "bad: scheduling from the idle thread!\n");
                dump_stack();
        }

        schedstat_inc(rq, sched_cnt);
        now = sched_clock();
        if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
                run_time = now - prev->timestamp;
        else
                run_time = NS_MAX_SLEEP_AVG;

        /*
         * Tasks with interactive credits get charged less run_time
         * at high sleep_avg to delay them losing their interactive
         * status
         */
        if (HIGH_CREDIT(prev))
                run_time /= (CURRENT_BONUS(prev) ? : 1);

        spin_lock_irq(&rq->lock);

        if (unlikely(current->flags & PF_DEAD))
                current->state = EXIT_DEAD;

        /*
         * if entering off of a kernel preemption go straight
         * to picking the next task.
         */
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                switch_count = &prev->nvcsw;
                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                                unlikely(signal_pending(prev))))
                        prev->state = TASK_RUNNING;
                else {
                        if (prev->state == TASK_UNINTERRUPTIBLE)
                                rq->nr_uninterruptible++;
                        deactivate_task(prev, rq);
                }
        }

        cpu = smp_processor_id();
        if (unlikely(!rq->nr_running)) {
go_idle:
                idle_balance(cpu, rq);
                if (!rq->nr_running) {
                        next = rq->idle;
                        rq->expired_timestamp = 0;
                        wake_sleeping_dependent(cpu, rq);
                        /*
                         * wake_sleeping_dependent() might have released
                         * the runqueue, so break out if we got new
                         * tasks meanwhile:
                         */
                        if (!rq->nr_running)
                                goto switch_tasks;
                }
        } else {
                if (dependent_sleeper(cpu, rq)) {
                        next = rq->idle;
                        goto switch_tasks;
                }
                /*
                 * dependent_sleeper() releases and reacquires the runqueue
                 * lock, hence go into the idle loop if the rq went
                 * empty meanwhile:
                 */
                if (unlikely(!rq->nr_running))
                        goto go_idle;
        }

        array = rq->active;
        if (unlikely(!array->nr_active)) {

As mentioned above: when every task in the active array has used up its timeslice, nothing is recomputed here. The scheduler simply takes the expired array, whose timeslices have already been recalculated, and makes it the new active array.

                /*
                 * Switch the active and expired arrays.
                 */
                schedstat_inc(rq, sched_switch);
                rq->active = rq->expired;
                rq->expired = array;
                array = rq->active;
                rq->expired_timestamp = 0;
                rq->best_expired_prio = MAX_PRIO;
        } else
                schedstat_inc(rq, sched_noswitch);

        idx = sched_find_first_bit(array->bitmap);

This finds the highest-priority non-empty queue.

        queue = array->queue + idx;
        next = list_entry(queue->next, task_t, run_list);

        if (!rt_task(next) && next->activated > 0) {
                unsigned long long delta = now - next->timestamp;

                if (next->activated == 1)
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

                array = next->array;
                dequeue_task(next, array);
                recalc_task_prio(next, next->timestamp + delta);
                enqueue_task(next, array);
        }
        next->activated = 0;
switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));

        prev->sleep_avg -= run_time;
        if ((long)prev->sleep_avg <= 0) {
                prev->sleep_avg = 0;
                if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
                        prev->interactive_credit--;
        }
        prev->timestamp = prev->last_ran = now;

        sched_info_switch(prev, next);
        if (likely(prev != next)) {

This means the process currently executing is not the highest-priority process that was just picked, so a full context switch has to be done; context_switch() below saves away (caches) the previous process's state.

                next->timestamp = now;
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                prepare_arch_switch(rq, next);
                prev = context_switch(rq, prev, next);
                barrier();

                finish_task_switch(prev);
        } else
                spin_unlock_irq(&rq->lock);

        prev = current;
        if (unlikely(reacquire_kernel_lock(prev) < 0))
                goto need_resched_nonpreemptible;
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
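To see why this selection is O(1), here is a stripped-down user-space model of one priority array: a bitmap with one bit per priority level plus an array of per-priority task lists, so picking the next task is just "find the first set bit, take the head of that list". The toy_* names, the singly linked lists and __builtin_ctzl() stand in for the kernel's prio_array_t, list_head and sched_find_first_bit(); this is a sketch of the idea, not the kernel's code.

#include <stdio.h>
#include <string.h>

#define TOY_MAX_PRIO   140                      /* same priority range as the 2.6 scheduler */
#define BITS_PER_LONG  (8 * sizeof(unsigned long))
#define BITMAP_LONGS   ((TOY_MAX_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct toy_task {
        int prio;                               /* 0..139, lower value = higher priority */
        struct toy_task *next;                  /* one singly linked list per priority */
};

struct toy_prio_array {
        unsigned long bitmap[BITMAP_LONGS];     /* bit p set <=> queue[p] is non-empty */
        struct toy_task *queue[TOY_MAX_PRIO];
};

static void toy_enqueue(struct toy_prio_array *a, struct toy_task *t)
{
        t->next = a->queue[t->prio];
        a->queue[t->prio] = t;
        a->bitmap[t->prio / BITS_PER_LONG] |= 1UL << (t->prio % BITS_PER_LONG);
}

/* Analogue of sched_find_first_bit(): scan a fixed, small number of words,
 * so the cost does not grow with the number of queued tasks. */
static struct toy_task *toy_pick_next(struct toy_prio_array *a)
{
        for (size_t i = 0; i < BITMAP_LONGS; i++) {
                if (a->bitmap[i]) {
                        size_t prio = i * BITS_PER_LONG + __builtin_ctzl(a->bitmap[i]);
                        return a->queue[prio];  /* head of the highest-priority list */
                }
        }
        return NULL;                            /* nothing runnable */
}

int main(void)
{
        struct toy_prio_array a;
        struct toy_task t1 = { .prio = 120 }, t2 = { .prio = 100 };

        memset(&a, 0, sizeof(a));
        toy_enqueue(&a, &t1);
        toy_enqueue(&a, &t2);
        printf("next prio = %d\n", toy_pick_next(&a)->prio);    /* 100 wins over 120 */
        return 0;
}

Because the bitmap has a fixed size (140 bits here), the cost of picking the next task never depends on how many tasks are runnable; combined with the active/expired swap shown above, which hands out new timeslices as tasks expire instead of recomputing them all at once, this is what makes the whole scheduler O(1).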