From: Suresh Siddha
On Thu, 2010-08-12 at 10:25 -0700, Heiko Carstens wrote:
> From: Heiko Carstens <heiko.carstens@de.ibm.com>
>
> On top of the SMT and MC scheduling domains this adds the BOOK scheduling
> domain. This is useful for machines that have a four-level cache hierarchy
> but do not fall into the NUMA category.
>
> Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>

PeterZ had some ideas for cleaning up the sched domain setup to avoid
this maze of #ifdefs. I will let him comment on this.
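For anyone wanting to try this on another architecture: as far as I can
tell, the patch expects asm/topology.h to supply SD_BOOK_INIT (enforced
by the new #error in include/linux/topology.h), a cpu_book_mask() helper
and a book_capable() predicate (used by the sysfs setup below). A rough
sketch of what that could look like — the cpu_book_map array and the
choice of reusing SD_CPU_INIT are illustrative, not from this patch:

	/*
	 * Sketch of the arch glue needed before selecting
	 * CONFIG_SCHED_BOOK, following the existing SMT/MC
	 * conventions in asm/topology.h.
	 */

	/* Balancing parameters for the book level; simply reusing
	 * the CPU level settings is the least-effort choice. */
	#define SD_BOOK_INIT		SD_CPU_INIT

	/* Mask of all logical CPUs sharing a book with @cpu, to be
	 * filled in by the arch's topology detection code. */
	extern cpumask_t cpu_book_map[NR_CPUS];
	#define cpu_book_mask(cpu)	(&cpu_book_map[cpu])

	/* Non-zero if the machine actually has multiple books; gates
	 * creation of the sched_book_power_savings sysfs file. */
	extern int book_capable(void);
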

thanks,
suresh
> ---
>
> arch/s390/defconfig | 1
> include/linux/sched.h | 19 +++++++
> include/linux/topology.h | 6 ++
> kernel/sched.c | 112 ++++++++++++++++++++++++++++++++++++++++++++---
> kernel/sched_fair.c | 11 ++--
> 5 files changed, 137 insertions(+), 12 deletions(-)
>
> diff -urpN linux-2.6/arch/s390/defconfig linux-2.6-patched/arch/s390/defconfig
> --- linux-2.6/arch/s390/defconfig 2010-08-02 00:11:14.000000000 +0200
> +++ linux-2.6-patched/arch/s390/defconfig 2010-08-11 13:47:23.000000000 +0200
> @@ -248,6 +248,7 @@ CONFIG_64BIT=y
> CONFIG_SMP=y
> CONFIG_NR_CPUS=32
> CONFIG_HOTPLUG_CPU=y
> +# CONFIG_SCHED_BOOK is not set
> CONFIG_COMPAT=y
> CONFIG_SYSVIPC_COMPAT=y
> CONFIG_AUDIT_ARCH=y
> diff -urpN linux-2.6/include/linux/sched.h linux-2.6-patched/include/linux/sched.h
> --- linux-2.6/include/linux/sched.h 2010-08-11 13:47:16.000000000 +0200
> +++ linux-2.6-patched/include/linux/sched.h 2010-08-11 13:47:23.000000000 +0200
> @@ -807,7 +807,9 @@ enum powersavings_balance_level {
> MAX_POWERSAVINGS_BALANCE_LEVELS
> };
>
> -extern int sched_mc_power_savings, sched_smt_power_savings;
> +extern int sched_smt_power_savings;
> +extern int sched_mc_power_savings;
> +extern int sched_book_power_savings;
>
> static inline int sd_balance_for_mc_power(void)
> {
> @@ -820,11 +822,23 @@ static inline int sd_balance_for_mc_powe
> return 0;
> }
>
> -static inline int sd_balance_for_package_power(void)
> +static inline int sd_balance_for_book_power(void)
> {
> if (sched_mc_power_savings | sched_smt_power_savings)
> return SD_POWERSAVINGS_BALANCE;
>
> + if (!sched_book_power_savings)
> + return SD_PREFER_SIBLING;
> +
> + return 0;
> +}
> +
> +static inline int sd_balance_for_package_power(void)
> +{
> + if (sched_book_power_savings | sched_mc_power_savings |
> + sched_smt_power_savings)
> + return SD_POWERSAVINGS_BALANCE;
> +
> return SD_PREFER_SIBLING;
> }
>
> @@ -875,6 +889,7 @@ enum sched_domain_level {
> SD_LV_NONE = 0,
> SD_LV_SIBLING,
> SD_LV_MC,
> + SD_LV_BOOK,
> SD_LV_CPU,
> SD_LV_NODE,
> SD_LV_ALLNODES,
> diff -urpN linux-2.6/include/linux/topology.h linux-2.6-patched/include/linux/topology.h
> --- linux-2.6/include/linux/topology.h 2010-08-11 13:47:16.000000000 +0200
> +++ linux-2.6-patched/include/linux/topology.h 2010-08-11 13:47:23.000000000 +0200
> @@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
> .balance_interval = 64, \
> }
>
> +#ifdef CONFIG_SCHED_BOOK
> +#ifndef SD_BOOK_INIT
> +#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
> +#endif
> +#endif /* CONFIG_SCHED_BOOK */
> +
> #ifdef CONFIG_NUMA
> #ifndef SD_NODE_INIT
> #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
> diff -urpN linux-2.6/kernel/sched.c linux-2.6-patched/kernel/sched.c
> --- linux-2.6/kernel/sched.c 2010-08-11 13:47:23.000000000 +0200
> +++ linux-2.6-patched/kernel/sched.c 2010-08-11 13:47:23.000000000 +0200
> @@ -6472,7 +6472,9 @@ static void sched_domain_node_span(int n
> }
> #endif /* CONFIG_NUMA */
>
> -int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
> +int sched_smt_power_savings;
> +int sched_mc_power_savings;
> +int sched_book_power_savings;
>
> /*
> * The cpus mask in sched_group and sched_domain hangs off the end.
> @@ -6500,6 +6502,7 @@ struct s_data {
> cpumask_var_t nodemask;
> cpumask_var_t this_sibling_map;
> cpumask_var_t this_core_map;
> + cpumask_var_t this_book_map;
> cpumask_var_t send_covered;
> cpumask_var_t tmpmask;
> struct sched_group **sched_group_nodes;
> @@ -6511,6 +6514,7 @@ enum s_alloc {
> sa_rootdomain,
> sa_tmpmask,
> sa_send_covered,
> + sa_this_book_map,
> sa_this_core_map,
> sa_this_sibling_map,
> sa_nodemask,
> @@ -6564,6 +6568,31 @@ cpu_to_core_group(int cpu, const struct
> }
> #endif /* CONFIG_SCHED_MC */
>
> +/*
> + * book sched-domains:
> + */
> +#ifdef CONFIG_SCHED_BOOK
> +static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
> +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
> +
> +static int
> +cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
> + struct sched_group **sg, struct cpumask *mask)
> +{
> + int group = cpu;
> +#ifdef CONFIG_SCHED_MC
> + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
> + group = cpumask_first(mask);
> +#elif defined(CONFIG_SCHED_SMT)
> + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
> + group = cpumask_first(mask);
> +#endif
> + if (sg)
> + *sg = &per_cpu(sched_group_book, group).sg;
> + return group;
> +}
> +#endif /* CONFIG_SCHED_BOOK */
> +
> static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
> static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
>
> @@ -6572,7 +6601,10 @@ cpu_to_phys_group(int cpu, const struct
> struct sched_group **sg, struct cpumask *mask)
> {
> int group;
> -#ifdef CONFIG_SCHED_MC
> +#ifdef CONFIG_SCHED_BOOK
> + cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
> + group = cpumask_first(mask);
> +#elif defined(CONFIG_SCHED_MC)
> cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
> group = cpumask_first(mask);
> #elif defined(CONFIG_SCHED_SMT)
> @@ -6833,6 +6865,9 @@ SD_INIT_FUNC(CPU)
> #ifdef CONFIG_SCHED_MC
> SD_INIT_FUNC(MC)
> #endif
> +#ifdef CONFIG_SCHED_BOOK
> + SD_INIT_FUNC(BOOK)
> +#endif
>
> static int default_relax_domain_level = -1;
>
> @@ -6882,6 +6917,8 @@ static void __free_domain_allocs(struct
> free_cpumask_var(d->tmpmask); /* fall through */
> case sa_send_covered:
> free_cpumask_var(d->send_covered); /* fall through */
> + case sa_this_book_map:
> + free_cpumask_var(d->this_book_map); /* fall through */
> case sa_this_core_map:
> free_cpumask_var(d->this_core_map); /* fall through */
> case sa_this_sibling_map:
> @@ -6928,8 +6965,10 @@ static enum s_alloc __visit_domain_alloc
> return sa_nodemask;
> if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
> return sa_this_sibling_map;
> - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
> + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
> return sa_this_core_map;
> + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
> + return sa_this_book_map;
> if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
> return sa_send_covered;
> d->rd = alloc_rootdomain();
> @@ -6987,6 +7026,23 @@ static struct sched_domain *__build_cpu_
> return sd;
> }
>
> +static struct sched_domain *__build_book_sched_domain(struct s_data *d,
> + const struct cpumask *cpu_map, struct sched_domain_attr *attr,
> + struct sched_domain *parent, int i)
> +{
> + struct sched_domain *sd = parent;
> +#ifdef CONFIG_SCHED_BOOK
> + sd = &per_cpu(book_domains, i).sd;
> + SD_INIT(sd, BOOK);
> + set_domain_attribute(sd, attr);
> + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
> + sd->parent = parent;
> + parent->child = sd;
> + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
> +#endif
> + return sd;
> +}
> +
> static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
> const struct cpumask *cpu_map, struct sched_domain_attr *attr,
> struct sched_domain *parent, int i)
> @@ -7044,6 +7100,15 @@ static void build_sched_groups(struct s_
> d->send_covered, d->tmpmask);
> break;
> #endif
> +#ifdef CONFIG_SCHED_BOOK
> + case SD_LV_BOOK: /* set up book groups */
> + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
> + if (cpu == cpumask_first(d->this_book_map))
> + init_sched_build_groups(d->this_book_map, cpu_map,
> + &cpu_to_book_group,
> + d->send_covered, d->tmpmask);
> + break;
> +#endif
> case SD_LV_CPU: /* set up physical groups */
> cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
> if (!cpumask_empty(d->nodemask))
> @@ -7091,12 +7156,14 @@ static int __build_sched_domains(const s
>
> sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
> sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
> + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
> sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
> sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
> }
>
> for_each_cpu(i, cpu_map) {
> build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
> + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
> build_sched_groups(&d, SD_LV_MC, cpu_map, i);
> }
>
> @@ -7127,6 +7194,12 @@ static int __build_sched_domains(const s
> init_sched_groups_power(i, sd);
> }
> #endif
> +#ifdef CONFIG_SCHED_BOOK
> + for_each_cpu(i, cpu_map) {
> + sd = &per_cpu(book_domains, i).sd;
> + init_sched_groups_power(i, sd);
> + }
> +#endif
>
> for_each_cpu(i, cpu_map) {
> sd = &per_cpu(phys_domains, i).sd;
> @@ -7152,6 +7225,8 @@ static int __build_sched_domains(const s
> sd = &per_cpu(cpu_domains, i).sd;
> #elif defined(CONFIG_SCHED_MC)
> sd = &per_cpu(core_domains, i).sd;
> +#elif defined(CONFIG_SCHED_BOOK)
> + sd = &per_cpu(book_domains, i).sd;
> #else
> sd = &per_cpu(phys_domains, i).sd;
> #endif
> @@ -7368,7 +7443,8 @@ match2:
> mutex_unlock(&sched_domains_mutex);
> }
>
> -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
> +#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
> + defined(CONFIG_SCHED_SMT)
> static void arch_reinit_sched_domains(void)
> {
> get_online_cpus();
> @@ -7405,6 +7481,9 @@ static ssize_t sched_power_savings_store
> case SD_LV_MC:
> sched_mc_power_savings = level;
> break;
> + case SD_LV_BOOK:
> + sched_book_power_savings = level;
> + break;
> default:
> break;
> }
> @@ -7414,6 +7493,24 @@ static ssize_t sched_power_savings_store
> return count;
> }
>
> +#ifdef CONFIG_SCHED_BOOK
> +static ssize_t sched_book_power_savings_show(struct sysdev_class *class,
> + struct sysdev_class_attribute *attr,
> + char *page)
> +{
> + return sprintf(page, "%u\n", sched_book_power_savings);
> +}
> +static ssize_t sched_book_power_savings_store(struct sysdev_class *class,
> + struct sysdev_class_attribute *attr,
> + const char *buf, size_t count)
> +{
> + return sched_power_savings_store(buf, count, SD_LV_BOOK);
> +}
> +static SYSDEV_CLASS_ATTR(sched_book_power_savings, 0644,
> + sched_book_power_savings_show,
> + sched_book_power_savings_store);
> +#endif
> +
> #ifdef CONFIG_SCHED_MC
> static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
> struct sysdev_class_attribute *attr,
> @@ -7464,9 +7561,14 @@ int __init sched_create_sysfs_power_savi
> err = sysfs_create_file(&cls->kset.kobj,
> &attr_sched_mc_power_savings.attr);
> #endif
> +#ifdef CONFIG_SCHED_BOOK
> + if (!err && book_capable())
> + err = sysfs_create_file(&cls->kset.kobj,
> + &attr_sched_book_power_savings.attr);
> +#endif
> return err;
> }
> -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
> +#endif /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
>
> /*
> * Update cpusets according to cpu_active mask. If cpusets are
> diff -urpN linux-2.6/kernel/sched_fair.c linux-2.6-patched/kernel/sched_fair.c
> --- linux-2.6/kernel/sched_fair.c 2010-08-11 13:47:16.000000000 +0200
> +++ linux-2.6-patched/kernel/sched_fair.c 2010-08-11 13:47:23.000000000 +0200
> @@ -2039,7 +2039,8 @@ struct sd_lb_stats {
> unsigned long busiest_group_capacity;
>
> int group_imb; /* Is there imbalance in this sd */
> -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
> +#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
> + defined(CONFIG_SCHED_SMT)
> int power_savings_balance; /* Is powersave balance needed for this sd */
> struct sched_group *group_min; /* Least loaded group in sd */
> struct sched_group *group_leader; /* Group which relieves group_min */
> @@ -2096,8 +2097,8 @@ static inline int get_sd_load_idx(struct
> return load_idx;
> }
>
> -
> -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
> +#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
> + defined(CONFIG_SCHED_SMT)
> /**
> * init_sd_power_savings_stats - Initialize power savings statistics for
> * the given sched_domain, during load balancing.
> @@ -2217,7 +2218,7 @@ static inline int check_power_save_busie
> return 1;
>
> }
> -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
> +#else /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
> static inline void init_sd_power_savings_stats(struct sched_domain *sd,
> struct sd_lb_stats *sds, enum cpu_idle_type idle)
> {
> @@ -2235,7 +2236,7 @@ static inline int check_power_save_busie
> {
> return 0;
> }
> -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
> +#endif /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
>
>
> unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
>
