Linux Headquarters
[ Register ]
[ About us ] [ Home Page ]

Advertisement
[ Kernel ] [ Documentation ] [ Links ] [ Books ]

Advertisement

Kernel v2.6.28 /kernel/sched.c

Filename:/kernel/sched.c
Lines Added:321
Lines Deleted:196
Also changed in: (Previous) 2.6.28-rc9  2.6.28-rc8  2.6.28-rc7  2.6.28-rc6-git4  2.6.28-rc6-git3  2.6.28-rc6-git2 
(Following) 2.6.28-git1  2.6.28-git2  2.6.28-git3  2.6.28-git4  2.6.28-git5  2.6.28-git6 

Location
[  2.6.28
  [  kernel
     o  sched.c

Patch

diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..e4bb1dd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -204,11 +206,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
    rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+   return sysctl_sched_rt_runtime >= 0;
+}
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
    ktime_t now;
 
-   if (rt_b->rt_runtime == RUNTIME_INF)
+   if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
       return;
 
    if (hrtimer_active(&rt_b->rt_period_timer))
@@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
       now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
       hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-      hrtimer_start(&rt_b->rt_period_timer,
-               rt_b->rt_period_timer.expires,
-               HRTIMER_MODE_ABS);
+      hrtimer_start_expires(&rt_b->rt_period_timer,
+            HRTIMER_MODE_ABS);
    }
    spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -380,7 +386,6 @@ struct cfs_rq {
 
    u64 exec_clock;
    u64 min_vruntime;
-   u64 pair_start;
 
    struct rb_root tasks_timeline;
    struct rb_node *rb_leftmost;
@@ -392,9 +397,9 @@ struct cfs_rq {
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
-   struct sched_entity *curr, *next;
+   struct sched_entity *curr, *next, *last;
 
-   unsigned long nr_spread_over;
+   unsigned int nr_spread_over;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
    struct rq *rq;   /* cpu runqueue to which this cfs_rq is attached */
@@ -604,9 +609,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-   rq->curr->sched_class->check_preempt_curr(rq, p);
+   rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -813,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
+ * Inject some fuzziness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -957,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
    }
 }
 
+void task_rq_unlock_wait(struct task_struct *p)
+{
+   struct rq *rq = task_rq(p);
+
+   smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+   spin_unlock_wait(&rq->lock);
+}
+
 static void __task_rq_unlock(struct rq *rq)
    __releases(rq->lock)
 {
@@ -1058,7 +1078,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
    struct hrtimer *timer = &rq->hrtick_timer;
    ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-   timer->expires = time;
+   hrtimer_set_expires(timer, time);
 
    if (rq == this_rq()) {
       hrtimer_restart(timer);
@@ -1102,7 +1122,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
    hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1141,7 @@ static void init_rq_hrtick(struct rq *rq)
    rq->hrtick_timer.function = hrtick;
    rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else   /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1153,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif   /* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1400,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
    update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-   struct rq *rq = cpu_rq(cpu);
-
-   if (rq->nr_running)
-      rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-   return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
    struct task_group *parent, *child;
+   int ret;
 
    rcu_read_lock();
    parent = &root_task_group;
 down:
-   (*down)(parent, cpu, sd);
+   ret = (*down)(parent, data);
+   if (ret)
+      goto out_unlock;
    list_for_each_entry_rcu(child, &parent->children, siblings) {
       parent = child;
       goto down;
@@ -1419,14 +1425,45 @@ down:
 up:
       continue;
    }
-   (*up)(parent, cpu, sd);
+   ret = (*up)(parent, data);
+   if (ret)
+      goto out_unlock;
 
    child = parent;
    parent = parent->parent;
    if (parent)
       goto up;
+out_unlock:
    rcu_read_unlock();
+
+   return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+   return 0;
 }
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+   unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+
+   if (nr_running)
+      rq->avg_load_per_task = rq->load.weight / nr_running;
+   else
+      rq->avg_load_per_task = 0;
+
+   return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1434,8 +1471,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-           unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+         unsigned long sd_shares, unsigned long sd_rq_weight)
 {
    int boost = 0;
    unsigned long shares;
@@ -1466,19 +1503,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
     *
     */
    shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+   shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-   /*
-    * record the actual number of shares, not the boosted amount.
-    */
-   tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-   tg->cfs_rq[cpu]->rq_weight = rq_weight;
+   if (abs(shares - tg->se[cpu]->load.weight) >
+         sysctl_sched_shares_thresh) {
+      struct rq *rq = cpu_rq(cpu);
+      unsigned long flags;
 
-   if (shares < MIN_SHARES)
-      shares = MIN_SHARES;
-   else if (shares > MAX_SHARES)
-      shares = MAX_SHARES;
+      spin_lock_irqsave(&rq->lock, flags);
+      /*
+       * record the actual number of shares, not the boosted amount.
+       */
+      tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+      tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-   __set_se_shares(tg->se[cpu], shares);
+      __set_se_shares(tg->se[cpu], shares);
+      spin_unlock_irqrestore(&rq->lock, flags);
+   }
 }
 
 /*
@@ -1486,11 +1527,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
    unsigned long rq_weight = 0;
    unsigned long shares = 0;
+   struct sched_domain *sd = data;
    int i;
 
    for_each_cpu_mask(i, sd->span) {
@@ -1507,14 +1548,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
    if (!rq_weight)
       rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-   for_each_cpu_mask(i, sd->span) {
-      struct rq *rq = cpu_rq(i);
-      unsigned long flags;
+   for_each_cpu_mask(i, sd->span)
+      update_group_shares_cpu(tg, i, shares, rq_weight);
 
-      spin_lock_irqsave(&rq->lock, flags);
-      __update_group_shares_cpu(tg, i, shares, rq_weight);
-      spin_unlock_irqrestore(&rq->lock, flags);
-   }
+   return 0;
 }
 
 /*
@@ -1522,10 +1559,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
    unsigned long load;
+   long cpu = (long)data;
 
    if (!tg->parent) {
       load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1573,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
    }
 
    tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+   return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1584,7 @@ static void update_shares(struct sched_domain *sd)
 
    if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
       sd->last_update = now;
-      walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+      walk_tg_tree(tg_nop, tg_shares_up, sd);
    }
 }
 
@@ -1561,9 +1595,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
    spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-   walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+   walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
@@ -1782,7 +1816,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
    /*
     * Buddy candidates are cache hot:
     */
-   if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+   if (sched_feat(CACHE_HOT_BUDDY) &&
+         (&p->se == cfs_rq_of(&p->se)->next ||
+          &p->se == cfs_rq_of(&p->se)->last))
       return 1;
 
    if (p->sched_class != &fair_sched_class)
@@ -1918,14 +1954,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
        * just go back and repeat.
        */
       rq = task_rq_lock(p, &flags);
+      trace_sched_wait_task(rq, p);
       running = task_running(rq, p);
       on_rq = p->se.on_rq;
       ncsw = 0;
-      if (!match_state || p->state == match_state) {
-         ncsw = p->nivcsw + p->nvcsw;
-         if (unlikely(!ncsw))
-            ncsw = 1;
-      }
+      if (!match_state || p->state == match_state)
+         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
       task_rq_unlock(rq, &flags);
 
       /*
@@ -2282,10 +2316,8 @@ out_activate:
    success = 1;
 
 out_running:
-   trace_mark(kernel_sched_wakeup,
-      "pid %d state %ld ## rq %p task %p rq->curr %p",
-      p->pid, p->state, rq, p, rq->curr);
-   check_preempt_curr(rq, p);
+   trace_sched_wakeup(rq, p);
+   check_preempt_curr(rq, p, sync);
 
    p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,10 +2449,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
       p->sched_class->task_new(rq, p);
       inc_nr_running(rq);
    }
-   trace_mark(kernel_sched_wakeup_new,
-      "pid %d state %ld ## rq %p task %p rq->curr %p",
-      p->pid, p->state, rq, p, rq->curr);
-   check_preempt_curr(rq, p);
+   trace_sched_wakeup_new(rq, p);
+   check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
    if (p->sched_class->task_wake_up)
       p->sched_class->task_wake_up(rq, p);
@@ -2592,11 +2622,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
    struct mm_struct *mm, *oldmm;
 
    prepare_task_switch(rq, prev, next);
-   trace_mark(kernel_sched_schedule,
-      "prev_pid %d next_pid %d prev_state %ld "
-      "## rq %p prev %p next %p",
-      prev->pid, next->pid, prev->state,
-      rq, prev, next);
+   trace_sched_switch(rq, prev, next);
    mm = next->mm;
    oldmm = prev->active_mm;
    /*
@@ -2836,6 +2862,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
        || unlikely(!cpu_active(dest_cpu)))
       goto out;
 
+   trace_sched_migrate_task(rq, p, dest_cpu);
    /* force the process onto the specified CPU */
    if (migrate_task(p, dest_cpu, &req)) {
       /* Need to wait for migration thread (might exit: take ref). */
@@ -2880,7 +2907,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
     * Note that idle threads have a prio of MAX_PRIO, for this test
     * to be always true for them.
     */
-   check_preempt_curr(this_rq, p);
+   check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -3329,7 +3356,7 @@ small_imbalance:
       } else
          this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-      if (max_load - this_load + 2*busiest_load_per_task >=
+      if (max_load - this_load + busiest_load_per_task >=
                busiest_load_per_task * imbn) {
          *imbalance = busiest_load_per_task;
          return busiest;
@@ -4037,23 +4064,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
    unsigned long flags;
-   u64 ns, delta_exec;
    struct rq *rq;
+   u64 ns = 0;
 
    rq = task_rq_lock(p, &flags);
-   ns = p->se.sum_exec_runtime;
+
    if (task_current(rq, p)) {
+      u64 delta_exec;
+
       update_rq_clock(rq);
       delta_exec = rq->clock - p->se.exec_start;
       if ((s64)delta_exec > 0)
-         ns += delta_exec;
+         ns = delta_exec;
    }
+
    task_rq_unlock(rq, &flags);
 
    return ns;
@@ -4070,6 +4100,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
    cputime64_t tmp;
 
    p->utime = cputime_add(p->utime, cputime);
+   account_group_user_time(p, cputime);
 
    /* Add user time to cpustat. */
    tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4125,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
    tmp = cputime_to_cputime64(cputime);
 
    p->utime = cputime_add(p->utime, cputime);
+   account_group_user_time(p, cputime);
    p->gtime = cputime_add(p->gtime, cputime);
 
    cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4161,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
    }
 
    p->stime = cputime_add(p->stime, cputime);
+   account_group_system_time(p, cputime);
 
    /* Add system time to cpustat. */
    tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4203,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
    if (p == rq->idle) {
       p->stime = cputime_add(p->stime, steal);
+      account_group_system_time(p, steal);
       if (atomic_read(&rq->nr_iowait) > 0)
          cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
       else
@@ -4426,12 +4460,8 @@ need_resched_nonpreemptible:
    if (sched_feat(HRTICK))
       hrtick_clear(rq);
 
-   /*
-    * Do the rq-clock update outside the rq lock:
-    */
-   local_irq_disable();
+   spin_lock_irq(&rq->lock);
    update_rq_clock(rq);
-   spin_lock(&rq->lock);
    clear_tsk_need_resched(prev);
 
    if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4627,6 +4657,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);   /* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
    unsigned long flags;
@@ -4638,6 +4677,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
    unsigned long flags;
@@ -4658,10 +4703,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
       wait.flags |= WQ_FLAG_EXCLUSIVE;
       __add_wait_queue_tail(&x->wait, &wait);
       do {
-         if ((state == TASK_INTERRUPTIBLE &&
-              signal_pending(current)) ||
-             (state == TASK_KILLABLE &&
-              fatal_signal_pending(current))) {
+         if (signal_pending_state(state, current)) {
             timeout = -ERESTARTSYS;
             break;
          }
@@ -4689,12 +4731,31 @@ wait_for_common(struct completion *x, long timeout, int state)
    return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
    wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4702,6 +4763,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
    long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4779,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                  unsigned long timeout)
@@ -4719,6 +4795,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
    long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5121,7 +5204,8 @@ recheck:
        * Do not allow realtime tasks into groups that have no runtime
        * assigned.
        */
-      if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+      if (rt_bandwidth_enabled() && rt_policy(policy) &&
+            task_group(p)->rt_bandwidth.rt_runtime == 0)
          return -EPERM;
 #endif
 
@@ -5787,6 +5871,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
    struct rq *rq = cpu_rq(cpu);
    unsigned long flags;
 
+   spin_lock_irqsave(&rq->lock, flags);
+
    __sched_fork(idle);
    idle->se.exec_start = sched_clock();
 
@@ -5794,7 +5880,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
    idle->cpus_allowed = cpumask_of_cpu(cpu);
    __set_task_cpu(idle, cpu);
 
-   spin_lock_irqsave(&rq->lock, flags);
    rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
    idle->oncpu = 1;
@@ -5957,7 +6042,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
    set_task_cpu(p, dest_cpu);
    if (on_rq) {
       activate_task(rq_dest, p, 0);
-      check_preempt_curr(rq_dest, p);
+      check_preempt_curr(rq_dest, p, 0);
    }
 done:
    ret = 1;
@@ -6282,7 +6367,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-   struct ctl_table *table = sd_alloc_ctl_entry(12);
+   struct ctl_table *table = sd_alloc_ctl_entry(13);
 
    if (table == NULL)
       return NULL;
@@ -6310,7 +6395,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
       sizeof(int), 0644, proc_dointvec_minmax);
    set_table_entry(&table[10], "flags", &sd->flags,
       sizeof(int), 0644, proc_dointvec_minmax);
-   /* &table[11] is terminator */
+   set_table_entry(&table[11], "name", sd->name,
+      CORENAME_MAX_SIZE, 0444, proc_dostring);
+   /* &table[12] is terminator */
 
    return table;
 }
@@ -6500,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
          req = list_entry(rq->migration_queue.next,
                 struct migration_req, list);
          list_del_init(&req->list);
+         spin_unlock_irq(&rq->lock);
          complete(&req->done);
+         spin_lock_irq(&rq->lock);
       }
       spin_unlock_irq(&rq->lock);
       break;
@@ -6802,15 +6891,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
    struct sched_domain *tmp;
 
    /* Remove the sched domains which do not contribute to scheduling. */
-   for (tmp = sd; tmp; tmp = tmp->parent) {
+   for (tmp = sd; tmp; ) {
       struct sched_domain *parent = tmp->parent;
       if (!parent)
          break;
+
       if (sd_parent_degenerate(tmp, parent)) {
          tmp->parent = parent->parent;
          if (parent->parent)
             parent->parent->child = tmp;
-      }
+      } else
+         tmp = tmp->parent;
    }
 
    if (sd && sd_degenerate(sd)) {
@@ -7194,13 +7285,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)      sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)      do { } while (0)
+#endif
+
 #define   SD_INIT(sd, type)   sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type)   \
 static noinline void sd_init_##type(struct sched_domain *sd)   \
 {                        \
    memset(sd, 0, sizeof(*sd));            \
    *sd = SD_##type##_INIT;               \
    sd->level = SD_LV_##type;            \
+   SD_INIT_NAME(sd, type);               \
 }
 
 SD_INIT_FUNC(CPU)
@@ -7591,6 +7690,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 error:
    free_sched_groups(cpu_map, tmpmask);
    SCHED_CPUMASK_FREE((void *)allmasks);
+   kfree(rd);
    return -ENOMEM;
 #endif
 }
@@ -7692,13 +7792,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * The passed in 'doms_new' should be kmalloc'd. This routine takes
  * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fall back to
+ * the single partition 'fallback_doms'; it also forces the domains
+ * to be rebuilt.
  *
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
@@ -8242,20 +8343,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
    static unsigned long prev_jiffy;   /* ratelimiting */
 
-   if ((in_atomic() || irqs_disabled()) &&
-       system_state == SYSTEM_RUNNING && !oops_in_progress) {
-      if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-         return;
-      prev_jiffy = jiffies;
-      printk(KERN_ERR "BUG: sleeping function called from invalid"
-            " context at %s:%d\n", file, line);
-      printk("in_atomic():%d, irqs_disabled():%d\n",
-         in_atomic(), irqs_disabled());
-      debug_show_held_locks(current);
-      if (irqs_disabled())
-         print_irqtrace_events(current);
-      dump_stack();
-   }
+   if ((!in_atomic() && !irqs_disabled()) ||
+          system_state != SYSTEM_RUNNING || oops_in_progress)
+      return;
+   if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+      return;
+   prev_jiffy = jiffies;
+
+   printk(KERN_ERR
+      "BUG: sleeping function called from invalid context at %s:%d\n",
+         file, line);
+   printk(KERN_ERR
+      "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+         in_atomic(), irqs_disabled(),
+         current->pid, current->comm);
+
+   debug_show_held_locks(current);
+   if (irqs_disabled())
+      print_irqtrace_events(current);
+   dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8859,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
    if (runtime == RUNTIME_INF)
-      return 1ULL << 16;
+      return 1ULL << 20;
 
-   return div64_u64(runtime << 16, period);
+   return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-   struct task_group *tgi, *parent = tg->parent;
-   unsigned long total = 0;
+   struct task_struct *g, *p;
 
-   if (!parent) {
-      if (global_rt_period() < period)
-         return 0;
+   do_each_thread(g, p) {
+      if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+         return 1;
+   } while_each_thread(g, p);
 
-      return to_ratio(period, runtime) <
-         to_ratio(global_rt_period(), global_rt_runtime());
-   }
+   return 0;
+}
 
-   if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-      return 0;
+struct rt_schedulable_data {
+   struct task_group *tg;
+   u64 rt_period;
+   u64 rt_runtime;
+};
 
-   rcu_read_lock();
-   list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-      if (tgi == tg)
-         continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+   struct rt_schedulable_data *d = data;
+   struct task_group *child;
+   unsigned long total, sum = 0;
+   u64 period, runtime;
+
+   period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+   runtime = tg->rt_bandwidth.rt_runtime;
 
-      total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-            tgi->rt_bandwidth.rt_runtime);
+   if (tg == d->tg) {
+      period = d->rt_period;
+      runtime = d->rt_runtime;
    }
-   rcu_read_unlock();
 
-   return total + to_ratio(period, runtime) <=
-      to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-            parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-   struct task_group *tgi;
-   unsigned long total = 0;
-   unsigned long global_ratio =
-      to_ratio(global_rt_period(), global_rt_runtime());
+   /*
+    * Cannot have more runtime than the period.
+    */
+   if (runtime > period && runtime != RUNTIME_INF)
+      return -EINVAL;
 
-   rcu_read_lock();
-   list_for_each_entry_rcu(tgi, &task_groups, list) {
-      if (tgi == tg)
-         continue;
+   /*
+    * Ensure we don't starve existing RT tasks.
+    */
+   if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+      return -EBUSY;
+
+   total = to_ratio(period, runtime);
+
+   /*
+    * Nobody can have more than the global setting allows.
+    */
+   if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+      return -EINVAL;
 
-      total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-            tgi->rt_bandwidth.rt_runtime);
+   /*
+    * The sum of our children's runtime should not exceed our own.
+    */
+   list_for_each_entry_rcu(child, &tg->children, siblings) {
+      period = ktime_to_ns(child->rt_bandwidth.rt_period);
+      runtime = child->rt_bandwidth.rt_runtime;
+
+      if (child == d->tg) {
+         period = d->rt_period;
+         runtime = d->rt_runtime;
+      }
+
+      sum += to_ratio(period, runtime);
    }
-   rcu_read_unlock();
 
-   return total + to_ratio(period, runtime) < global_ratio;
+   if (sum > total)
+      return -EINVAL;
+
+   return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-   struct task_struct *g, *p;
-   do_each_thread(g, p) {
-      if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-         return 1;
-   } while_each_thread(g, p);
-   return 0;
+   struct rt_schedulable_data data = {
+      .tg = tg,
+      .rt_period = period,
+      .rt_runtime = runtime,
+   };
+
+   return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8829,14 +8957,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
    mutex_lock(&rt_constraints_mutex);
    read_lock(&tasklist_lock);
-   if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-      err = -EBUSY;
-      goto unlock;
-   }
-   if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-      err = -EINVAL;
+   err = __rt_schedulable(tg, rt_period, rt_runtime);
+   if (err)
       goto unlock;
-   }
 
    spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
    tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9028,25 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-   struct task_group *tg = &root_task_group;
-   u64 rt_runtime, rt_period;
+   u64 runtime, period;
    int ret = 0;
 
    if (sysctl_sched_rt_period <= 0)
       return -EINVAL;
 
-   rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-   rt_runtime = tg->rt_bandwidth.rt_runtime;
+   runtime = global_rt_runtime();
+   period = global_rt_period();
+
+   /*
+    * Sanity check on the sysctl variables.
+    */
+   if (runtime > period && runtime != RUNTIME_INF)
+      return -EINVAL;
 
    mutex_lock(&rt_constraints_mutex);
-   if (!__rt_schedulable(tg, rt_period, rt_runtime))
-      ret = -EINVAL;
+   read_lock(&tasklist_lock);
+   ret = __rt_schedulable(NULL, 0, 0);
+   read_unlock(&tasklist_lock);
    mutex_unlock(&rt_constraints_mutex);
 
    return ret;
@@ -8991,7 +9120,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
    if (!cgrp->parent) {
       /* This is early initialization for the top cgroup */
-      init_task_group.css.cgroup = cgrp;
       return &init_task_group.css;
    }
 
@@ -9000,9 +9128,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
    if (IS_ERR(tg))
       return ERR_PTR(-ENOMEM);
 
-   /* Bind the cgroup to task_group object we just created */
-   tg->css.cgroup = cgrp;
-
    return &tg->css;
 }
 


Comments: webmaster (at) linuxhq.com.
Advertising: banners (at) linuxhq.com.
Compilation ©1998-2008 Linux Headquarters, Inc.