From: Bharata B Rao on 25 Feb 2010 03:20 On Fri, Feb 12, 2010 at 06:54:57PM -0800, Paul wrote: > --- a/kernel/sched.c > +++ b/kernel/sched.c > @@ -190,10 +190,28 @@ static inline int rt_bandwidth_enabled(void) > return sysctl_sched_rt_runtime >= 0; > } > > -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) > { > - ktime_t now; > + unsigned long delta; > + ktime_t soft, hard, now; > + > + for (;;) { > + if (hrtimer_active(period_timer)) > + break; > + > + now = hrtimer_cb_get_time(period_timer); > + hrtimer_forward(period_timer, now, period); > + > + soft = hrtimer_get_softexpires(period_timer); > + hard = hrtimer_get_expires(period_timer); > + delta = ktime_to_ns(ktime_sub(hard, soft)); > + __hrtimer_start_range_ns(period_timer, soft, delta, > + HRTIMER_MODE_ABS_PINNED, 0); > + } > +} > > +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > +{ > if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) > return; > > @@ -201,22 +219,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > return; > > raw_spin_lock(&rt_b->rt_runtime_lock); > - for (;;) { > - unsigned long delta; > - ktime_t soft, hard; > - > - if (hrtimer_active(&rt_b->rt_period_timer)) > - break; > - > - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); > - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); > - > - soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); > - hard = hrtimer_get_expires(&rt_b->rt_period_timer); > - delta = ktime_to_ns(ktime_sub(hard, soft)); > - __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, > - HRTIMER_MODE_ABS_PINNED, 0); > - } > + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); > raw_spin_unlock(&rt_b->rt_runtime_lock); > } > > @@ -241,6 +244,15 @@ struct cfs_rq; > > static LIST_HEAD(task_groups); > > +#ifdef CONFIG_CFS_BANDWIDTH > +struct cfs_bandwidth { > + raw_spinlock_t lock; > + ktime_t period; > + u64 
runtime, quota; > + struct hrtimer period_timer; > +}; > +#endif > + > /* task group related information */ > struct task_group { > #ifdef CONFIG_CGROUP_SCHED > @@ -272,6 +284,10 @@ struct task_group { > struct task_group *parent; > struct list_head siblings; > struct list_head children; > + > +#ifdef CONFIG_CFS_BANDWIDTH > + struct cfs_bandwidth cfs_bandwidth; > +#endif > }; > > #ifdef CONFIG_USER_SCHED > @@ -445,9 +461,76 @@ struct cfs_rq { > */ > unsigned long rq_weight; > #endif > +#ifdef CONFIG_CFS_BANDWIDTH > + u64 quota_assigned, quota_used; > +#endif > #endif > }; > > +#ifdef CONFIG_CFS_BANDWIDTH > +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); > + > +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) > +{ > + struct cfs_bandwidth *cfs_b = > + container_of(timer, struct cfs_bandwidth, period_timer); > + ktime_t now; > + int overrun; > + int idle = 0; > + > + for (;;) { > + now = hrtimer_cb_get_time(timer); > + overrun = hrtimer_forward(timer, now, cfs_b->period); > + > + if (!overrun) > + break; > + > + idle = do_sched_cfs_period_timer(cfs_b, overrun); > + } > + > + return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; > +} > + > +static > +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 quota, u64 period) > +{ > + raw_spin_lock_init(&cfs_b->lock); > + cfs_b->quota = cfs_b->runtime = quota; > + cfs_b->period = ns_to_ktime(period); > + > + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > + cfs_b->period_timer.function = sched_cfs_period_timer; > +} > + > +static > +void init_cfs_rq_quota(struct cfs_rq *cfs_rq) > +{ > + cfs_rq->quota_used = 0; > + if (cfs_rq->tg->cfs_bandwidth.quota == RUNTIME_INF) > + cfs_rq->quota_assigned = RUNTIME_INF; > + else > + cfs_rq->quota_assigned = 0; > +} > + > +static void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) > +{ > + if (cfs_b->quota == RUNTIME_INF) > + return; > + > + if (hrtimer_active(&cfs_b->period_timer)) > + return; > + > + raw_spin_lock(&cfs_b->lock); > + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); > + raw_spin_unlock(&cfs_b->lock); > +} > + > +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) > +{ > + hrtimer_cancel(&cfs_b->period_timer); > +} > +#endif Maybe you could define some of these functions for the !CONFIG_CFS_BANDWIDTH case as well, and so avoid calling them under #ifdef? I was given this comment during my initial iterations. > + > /* Real-Time classes' related field in a runqueue: */ > struct rt_rq { > struct rt_prio_array active; > @@ -1834,6 +1917,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) > #endif > } > > +#ifdef CONFIG_CFS_BANDWIDTH > +/* > + * default period for cfs group bandwidth. 
> + * default: 0.5s > + */ > +static u64 sched_cfs_bandwidth_period = 500000000ULL; > +#endif > + > #include "sched_stats.h" > #include "sched_idletask.c" > #include "sched_fair.c" > @@ -9422,6 +9513,9 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, > tg->cfs_rq[cpu] = cfs_rq; > init_cfs_rq(cfs_rq, rq); > cfs_rq->tg = tg; > +#ifdef CONFIG_CFS_BANDWIDTH > + init_cfs_rq_quota(cfs_rq); > +#endif > if (add) > list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); > > @@ -9594,6 +9688,10 @@ void __init sched_init(void) > * We achieve this by letting init_task_group's tasks sit > * directly in rq->cfs (i.e init_task_group->se[] = NULL). > */ > +#ifdef CONFIG_CFS_BANDWIDTH > + init_cfs_bandwidth(&init_task_group.cfs_bandwidth, > + RUNTIME_INF, sched_cfs_bandwidth_period); > +#endif > init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); > #elif defined CONFIG_USER_SCHED > root_task_group.shares = NICE_0_LOAD; > @@ -9851,6 +9949,10 @@ static void free_fair_sched_group(struct task_group *tg) > { > int i; > > +#ifdef CONFIG_CFS_BANDWIDTH > + destroy_cfs_bandwidth(&tg->cfs_bandwidth); > +#endif > + > for_each_possible_cpu(i) { > if (tg->cfs_rq) > kfree(tg->cfs_rq[i]); > @@ -9878,7 +9980,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) > goto err; > > tg->shares = NICE_0_LOAD; > - > +#ifdef CONFIG_CFS_BANDWIDTH > + init_cfs_bandwidth(&tg->cfs_bandwidth, RUNTIME_INF, > + sched_cfs_bandwidth_period); > +#endif > for_each_possible_cpu(i) { > rq = cpu_rq(i); > > @@ -10333,7 +10438,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) > return walk_tg_tree(tg_schedulable, tg_nop, &data); > } > > -static int tg_set_bandwidth(struct task_group *tg, > +static int tg_set_rt_bandwidth(struct task_group *tg, > u64 rt_period, u64 rt_runtime) > { > int i, err = 0; > @@ -10372,7 +10477,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) > if (rt_runtime_us < 
0) > rt_runtime = RUNTIME_INF; > > - return tg_set_bandwidth(tg, rt_period, rt_runtime); > + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); > } > > long sched_group_rt_runtime(struct task_group *tg) > @@ -10397,7 +10502,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) > if (rt_period == 0) > return -EINVAL; > > - return tg_set_bandwidth(tg, rt_period, rt_runtime); > + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); > } > > long sched_group_rt_period(struct task_group *tg) > @@ -10604,6 +10709,120 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) > > return (u64) tg->shares; > } > + > +#ifdef CONFIG_CFS_BANDWIDTH > +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) > +{ > + int i; > + static DEFINE_MUTEX(mutex); > + > + if (tg == &init_task_group) > + return -EINVAL; > + > + if (!period) > + return -EINVAL; > + > + mutex_lock(&mutex); What is this mutex for? So you are essentially serializing the bandwidth setting of all groups? While that itself isn't an issue, I am just wondering whether cfs_bandwidth.lock isn't sufficient? > + /* > + * Ensure we have at least one tick of bandwidth every period. This is > + * to prevent reaching a state of large arrears when throttled via > + * entity_tick() resulting in prolonged exit starvation. > + */ > + if (NS_TO_JIFFIES(quota) < 1) > + return -EINVAL; Returning with the mutex held? 
> + > + raw_spin_lock_irq(&tg->cfs_bandwidth.lock); > + tg->cfs_bandwidth.period = ns_to_ktime(period); > + tg->cfs_bandwidth.runtime = tg->cfs_bandwidth.quota = quota; > + raw_spin_unlock_irq(&tg->cfs_bandwidth.lock); > + > + for_each_possible_cpu(i) { > + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; > + struct rq *rq = rq_of(cfs_rq); > + > + raw_spin_lock_irq(&rq->lock); > + cfs_rq->quota_used = 0; > + if (quota == RUNTIME_INF) > + cfs_rq->quota_assigned = RUNTIME_INF; > + else > + cfs_rq->quota_assigned = 0; > + raw_spin_unlock_irq(&rq->lock); > + } > + mutex_unlock(&mutex); > + > + return 0; > +} > + > static void task_waking_fair(struct rq *rq, struct task_struct *p) > @@ -1172,7 +1180,7 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) > * We still saw a performance dip, some tracing learned us that between > * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased > * significantly. Therefore try to bias the error in direction of failing > - * the affine wakeup. > + * the affie wakeup. Unintended change ? Regards, Bharata. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Bharata B Rao on 26 Feb 2010 07:00 On Thu, Feb 25, 2010 at 02:30:44AM -0800, Paul Turner wrote: > On Thu, Feb 25, 2010 at 12:14 AM, Bharata B Rao > <bharata(a)linux.vnet.ibm.com> wrote: > > On Fri, Feb 12, 2010 at 06:54:57PM -0800, Paul wrote: > >> --- a/kernel/sched.c > >> +++ b/kernel/sched.c > >> @@ -190,10 +190,28 @@ static inline int rt_bandwidth_enabled(void) > >> � � � return sysctl_sched_rt_runtime >= 0; > >> �} > >> > >> -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > >> +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) > >> �{ > >> - � � ktime_t now; > >> + � � unsigned long delta; > >> + � � ktime_t soft, hard, now; > >> + > >> + � � for (;;) { > >> + � � � � � � if (hrtimer_active(period_timer)) > >> + � � � � � � � � � � break; > >> + > >> + � � � � � � now = hrtimer_cb_get_time(period_timer); > >> + � � � � � � hrtimer_forward(period_timer, now, period); > >> + > >> + � � � � � � soft = hrtimer_get_softexpires(period_timer); > >> + � � � � � � hard = hrtimer_get_expires(period_timer); > >> + � � � � � � delta = ktime_to_ns(ktime_sub(hard, soft)); > >> + � � � � � � __hrtimer_start_range_ns(period_timer, soft, delta, > >> + � � � � � � � � � � � � � � � � � � �HRTIMER_MODE_ABS_PINNED, 0); > >> + � � } > >> +} > >> > >> +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > >> +{ > >> � � � if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) > >> � � � � � � � return; > >> > >> @@ -201,22 +219,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) > >> � � � � � � � return; > >> > >> � � � raw_spin_lock(&rt_b->rt_runtime_lock); > >> - � � for (;;) { > >> - � � � � � � unsigned long delta; > >> - � � � � � � ktime_t soft, hard; > >> - > >> - � � � � � � if (hrtimer_active(&rt_b->rt_period_timer)) > >> - � � � � � � � � � � break; > >> - > >> - � � � � � � now = hrtimer_cb_get_time(&rt_b->rt_period_timer); > >> - � � � � � � hrtimer_forward(&rt_b->rt_period_timer, now, 
rt_b->rt_period); > >> - > >> - � � � � � � soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); > >> - � � � � � � hard = hrtimer_get_expires(&rt_b->rt_period_timer); > >> - � � � � � � delta = ktime_to_ns(ktime_sub(hard, soft)); > >> - � � � � � � __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, > >> - � � � � � � � � � � � � � � HRTIMER_MODE_ABS_PINNED, 0); > >> - � � } > >> + � � start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); > >> � � � raw_spin_unlock(&rt_b->rt_runtime_lock); > >> �} > >> > >> @@ -241,6 +244,15 @@ struct cfs_rq; > >> > >> �static LIST_HEAD(task_groups); > >> > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> +struct cfs_bandwidth { > >> + � � raw_spinlock_t � � � � �lock; > >> + � � ktime_t � � � � � � � � period; > >> + � � u64 � � � � � � � � � � runtime, quota; > >> + � � struct hrtimer � � � � �period_timer; > >> +}; > >> +#endif > >> + > >> �/* task group related information */ > >> �struct task_group { > >> �#ifdef CONFIG_CGROUP_SCHED > >> @@ -272,6 +284,10 @@ struct task_group { > >> � � � struct task_group *parent; > >> � � � struct list_head siblings; > >> � � � struct list_head children; > >> + > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � struct cfs_bandwidth cfs_bandwidth; > >> +#endif > >> �}; > >> > >> �#ifdef CONFIG_USER_SCHED > >> @@ -445,9 +461,76 @@ struct cfs_rq { > >> � � � �*/ > >> � � � unsigned long rq_weight; > >> �#endif > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � u64 quota_assigned, quota_used; > >> +#endif > >> �#endif > >> �}; > >> > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); > >> + > >> +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) > >> +{ > >> + � � struct cfs_bandwidth *cfs_b = > >> + � � � � � � container_of(timer, struct cfs_bandwidth, period_timer); > >> + � � ktime_t now; > >> + � � int overrun; > >> + � � int idle = 0; > >> + > >> + � � for (;;) { > >> + � � � � � � now = 
hrtimer_cb_get_time(timer); > >> + � � � � � � overrun = hrtimer_forward(timer, now, cfs_b->period); > >> + > >> + � � � � � � if (!overrun) > >> + � � � � � � � � � � break; > >> + > >> + � � � � � � idle = do_sched_cfs_period_timer(cfs_b, overrun); > >> + � � } > >> + > >> + � � return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; > >> +} > >> + > >> +static > >> +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 quota, u64 period) > >> +{ > >> + � � raw_spin_lock_init(&cfs_b->lock); > >> + � � cfs_b->quota = cfs_b->runtime = quota; > >> + � � cfs_b->period = ns_to_ktime(period); > >> + > >> + � � hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > >> + � � cfs_b->period_timer.function = sched_cfs_period_timer; > >> +} > >> + > >> +static > >> +void init_cfs_rq_quota(struct cfs_rq *cfs_rq) > >> +{ > >> + � � cfs_rq->quota_used = 0; > >> + � � if (cfs_rq->tg->cfs_bandwidth.quota == RUNTIME_INF) > >> + � � � � � � cfs_rq->quota_assigned = RUNTIME_INF; > >> + � � else > >> + � � � � � � cfs_rq->quota_assigned = 0; > >> +} > >> + > >> +static void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) > >> +{ > >> + � � if (cfs_b->quota == RUNTIME_INF) > >> + � � � � � � return; > >> + > >> + � � if (hrtimer_active(&cfs_b->period_timer)) > >> + � � � � � � return; > >> + > >> + � � raw_spin_lock(&cfs_b->lock); > >> + � � start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); > >> + � � raw_spin_unlock(&cfs_b->lock); > >> +} > >> + > >> +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) > >> +{ > >> + � � hrtimer_cancel(&cfs_b->period_timer); > >> +} > >> +#endif > > > > May be you could define some of this functions for !CONFIG_CFS_BANDWIDTH case > > and avoid them calling under #ifdef ? I was given this comment during my > > initial iterations. > > > > Was it for init or run-time functions? We try to maintain the empty > def style for most of our run-time functions (e.g. 
cfs_throttled); for > init it seems more descriptive (and in keeping with the rest of sched > init) to ifdef specific initialization. > > Regardless, I will definitely give this a pass-over to see what I can clean up. Even for init functions. > > >> + > >> �/* Real-Time classes' related field in a runqueue: */ > >> �struct rt_rq { > >> � � � struct rt_prio_array active; > >> @@ -1834,6 +1917,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) > >> �#endif > >> �} > >> > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> +/* > >> + * default period for cfs group bandwidth. > >> + * default: 0.5s > >> + */ > >> +static u64 sched_cfs_bandwidth_period = 500000000ULL; > >> +#endif > >> + > >> �#include "sched_stats.h" > >> �#include "sched_idletask.c" > >> �#include "sched_fair.c" > >> @@ -9422,6 +9513,9 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, > >> � � � tg->cfs_rq[cpu] = cfs_rq; > >> � � � init_cfs_rq(cfs_rq, rq); > >> � � � cfs_rq->tg = tg; > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � init_cfs_rq_quota(cfs_rq); > >> +#endif > >> � � � if (add) > >> � � � � � � � list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); > >> > >> @@ -9594,6 +9688,10 @@ void __init sched_init(void) > >> � � � � � � � �* We achieve this by letting init_task_group's tasks sit > >> � � � � � � � �* directly in rq->cfs (i.e init_task_group->se[] = NULL). 
> >> � � � � � � � �*/ > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � � � � � init_cfs_bandwidth(&init_task_group.cfs_bandwidth, > >> + � � � � � � � � � � � � � � RUNTIME_INF, sched_cfs_bandwidth_period); > >> +#endif > >> � � � � � � � init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); > >> �#elif defined CONFIG_USER_SCHED > >> � � � � � � � root_task_group.shares = NICE_0_LOAD; > >> @@ -9851,6 +9949,10 @@ static void free_fair_sched_group(struct task_group *tg) > >> �{ > >> � � � int i; > >> > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � destroy_cfs_bandwidth(&tg->cfs_bandwidth); > >> +#endif > >> + > >> � � � for_each_possible_cpu(i) { > >> � � � � � � � if (tg->cfs_rq) > >> � � � � � � � � � � � kfree(tg->cfs_rq[i]); > >> @@ -9878,7 +9980,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) > >> � � � � � � � goto err; > >> > >> � � � tg->shares = NICE_0_LOAD; > >> - > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> + � � init_cfs_bandwidth(&tg->cfs_bandwidth, RUNTIME_INF, > >> + � � � � � � � � � � sched_cfs_bandwidth_period); > >> +#endif > >> � � � for_each_possible_cpu(i) { > >> � � � � � � � rq = cpu_rq(i); > >> > >> @@ -10333,7 +10438,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) > >> � � � return walk_tg_tree(tg_schedulable, tg_nop, &data); > >> �} > >> > >> -static int tg_set_bandwidth(struct task_group *tg, > >> +static int tg_set_rt_bandwidth(struct task_group *tg, > >> � � � � � � � u64 rt_period, u64 rt_runtime) > >> �{ > >> � � � int i, err = 0; > >> @@ -10372,7 +10477,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) > >> � � � if (rt_runtime_us < 0) > >> � � � � � � � rt_runtime = RUNTIME_INF; > >> > >> - � � return tg_set_bandwidth(tg, rt_period, rt_runtime); > >> + � � return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); > >> �} > >> > >> �long sched_group_rt_runtime(struct task_group *tg) > >> @@ -10397,7 +10502,7 @@ int 
sched_group_set_rt_period(struct task_group *tg, long rt_period_us) > >> � � � if (rt_period == 0) > >> � � � � � � � return -EINVAL; > >> > >> - � � return tg_set_bandwidth(tg, rt_period, rt_runtime); > >> + � � return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); > >> �} > >> > >> �long sched_group_rt_period(struct task_group *tg) > >> @@ -10604,6 +10709,120 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) > >> > >> � � � return (u64) tg->shares; > >> �} > >> + > >> +#ifdef CONFIG_CFS_BANDWIDTH > >> +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) > >> +{ > >> + � � int i; > >> + � � static DEFINE_MUTEX(mutex); > >> + > >> + � � if (tg == &init_task_group) > >> + � � � � � � return -EINVAL; > >> + > >> + � � if (!period) > >> + � � � � � � return -EINVAL; > >> + > >> + � � mutex_lock(&mutex); > > > > What is this mutex for ? So you essentially serializing the bandwidth > > setting of all groups ? While that iself isn't an issue, just wondering if > > cfs_bandwidth.lock isn't suffient ? > > > > The idea isn't to synchronize quota updates for all groups, but to > synchronize it within a single group. Consider the case of 2 parallel > writes, one setting infinite bandwidth the other setting finite. > Depending on rq->lock contention it's possible for both values to > propagate to some of the cpus. > > You sit on the bandwidth lock because then there is inversion with > update_curr, e.g. > > cpu1 -> rq->lock held, update_curr -> request bandwidth -> acquire > cfs_bandwidth.lock > cpu2 -> tg_set_cfs_bandwidth, hold cfs_bandwidth -> try to acquire cpu1 rq->lock > > This mutex could be per-cgroup but users shouldn't be updating fast > enough to the point where they require it, it also reduces rq->lock > slamming when users update several cgroups in parallel. I get it. cfs_bandwidth.lock was suffienct for me since I have per cfs_rq locks protecting the runtime related fields of cfs_rq. 
When I started, I too had a simple scheme like yours, where a per-rq lock protected the runtime-related fields of all cfs_rqs under it, but when I added runtime rebalancing, I found the need for per-cfs_rq locks and hence followed the rt code more closely. But I can see that since you don't do runtime rebalancing, you can keep the locking simple, with just a per-rq lock protecting the runtime-related fields of all cfs_rqs under it. Regards, Bharata. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
|
Pages: 1 Prev: Perf python scripting support Next: workqueues: change cancel_work_sync() to clear work->data |