From: Borislav Petkov on 22 Feb 2010 13:50 From: "H. Peter Anvin" <hpa(a)zytor.com> Date: Mon, Feb 22, 2010 at 09:21:05AM -0800 > On 02/22/2010 06:17 AM, Borislav Petkov wrote: > > > > +config ARCH_HWEIGHT_CFLAGS > > + string > > + default "-fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 > > + > > [...] > > > + > > +#ifdef CONFIG_64BIT > > +/* popcnt %rdi, %rax */ > > +#define POPCNT ".byte 0xf3\n\t.byte 0x48\n\t.byte 0x0f\n\t.byte 0xb8\n\t.byte 0xc7" > > +#define REG_IN "D" > > +#define REG_OUT "a" > > +#else > > Just a note: this still means rdi is clobbered on x86-64, which is > probably fine, but needs to be recorded as such. Since gcc doesn't > support clobbers for registers used as operands (sigh), you have to > create a dummy output and assign it a "=D" constraint. > > I don't know if gcc would handle -fcall-saved-rdi here... and if so, how > reliably. Ok, from looking at kernel/sched.s output it looks like it saves rdi content over the alternative where needed. I'll do some more testing just to make sure. -- From: Borislav Petkov <borislav.petkov(a)amd.com> Date: Thu, 11 Feb 2010 00:48:31 +0100 Subject: [PATCH] x86: Add optimized popcnt variants Add support for the hardware version of the Hamming weight function, popcnt, present in CPUs which advertize it under CPUID, Function 0x0000_0001_ECX[23]. On CPUs which don't support it, we fallback to the default lib/hweight.c sw versions. A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost a 3x speedup on a F10h machine. Signed-off-by: Borislav Petkov <borislav.petkov(a)amd.com> --- arch/x86/Kconfig | 5 ++ arch/x86/include/asm/alternative.h | 9 +++- arch/x86/include/asm/arch_hweight.h | 59 +++++++++++++++++++++++++++++ arch/x86/include/asm/bitops.h | 4 +- include/asm-generic/bitops/arch_hweight.h | 22 +++++++++-- lib/Makefile | 3 + lib/hweight.c | 20 +++++----- scripts/Makefile.lib | 4 ++ 8 files changed, 108 insertions(+), 18 deletions(-) create mode 100644 arch/x86/include/asm/arch_hweight.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb40925..176950e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -230,6 +230,11 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR +config ARCH_HWEIGHT_CFLAGS + string + default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 + default "-fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7..0720c96 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -39,9 +39,6 @@ #define LOCK_PREFIX "" #endif -/* This must be included *after* the definition of LOCK_PREFIX */ -#include <asm/cpufeature.h> - struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; @@ -91,6 +88,12 @@ static inline void alternatives_smp_switch(int smp) {} ".previous" /* + * This must be included *after* the definition of ALTERNATIVE due to + * <asm/arch_hweight.h> + */ +#include <asm/cpufeature.h> + +/* * Alternative instructions for different CPU types or capabilities. * * This allows to use optimized instructions even on generic binary diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h new file mode 100644 index 0000000..cc3a188 --- /dev/null +++ b/arch/x86/include/asm/arch_hweight.h @@ -0,0 +1,59 @@ +#ifndef _ASM_X86_HWEIGHT_H +#define _ASM_X86_HWEIGHT_H + +#ifdef CONFIG_64BIT +/* popcnt %rdi, %rax */ +#define POPCNT ".byte 0xf3\n\t.byte 0x48\n\t.byte 0x0f\n\t.byte 0xb8\n\t.byte 0xc7" +#define REG_IN "D" +#define REG_OUT "a" +#else +/* popcnt %eax, %eax */ +#define POPCNT ".byte 0xf3\n\t.byte 0x0f\n\t.byte 0xb8\n\t.byte 0xc0" +#define REG_IN "a" +#define REG_OUT "a" +#endif + +/* + * __sw_hweightXX are called from within the alternatives below + * and callee-clobbered registers need to be taken care of. See + * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective + * compiler switches. + */ +static inline unsigned int __arch_hweight32(unsigned int w) +{ + unsigned int res = 0; + + asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res) + : REG_IN (w)); + + return res; +} + +static inline unsigned int __arch_hweight16(unsigned int w) +{ + return __arch_hweight32(w & 0xffff); +} + +static inline unsigned int __arch_hweight8(unsigned int w) +{ + return __arch_hweight32(w & 0xff); +} + +static inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0, dummy; + +#ifdef CONFIG_X86_32 + return __arch_hweight32((u32)w) + + __arch_hweight32((u32)(w >> 32)); +#else + asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res), "="REG_IN (dummy) + : REG_IN (w)); +#endif /* CONFIG_X86_32 */ + + return res; +} + +#endif diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 02b47a6..545776e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -444,7 +444,9 @@ static inline int fls(int x) #define ARCH_HAS_FAST_MULTIPLIER 1 -#include <asm-generic/bitops/hweight.h> +#include <asm/arch_hweight.h> + +#include <asm-generic/bitops/const_hweight.h> #endif /* __KERNEL__ */ diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h index 3a7be84..9a81c1e 100644 --- a/include/asm-generic/bitops/arch_hweight.h +++ b/include/asm-generic/bitops/arch_hweight.h @@ -3,9 +3,23 @@ #include <asm/types.h> -extern unsigned int __arch_hweight32(unsigned int w); -extern unsigned int __arch_hweight16(unsigned int w); -extern unsigned int __arch_hweight8(unsigned int w); -extern unsigned long __arch_hweight64(__u64 w); +inline unsigned int __arch_hweight32(unsigned int w) +{ + return __sw_hweight32(w); +} +inline unsigned int __arch_hweight16(unsigned int w) +{ + return __sw_hweight16(w); +} + +inline unsigned int __arch_hweight8(unsigned int w) +{ + return __sw_hweight8(w); +} + +inline unsigned long __arch_hweight64(__u64 w) +{ + return __sw_hweight64(w); +} #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */ diff --git a/lib/Makefile b/lib/Makefile index 3b0b4a6..e2ad17c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o + +CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o + obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/lib/hweight.c b/lib/hweight.c index 9ff86df..f9ce440 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,7 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ -unsigned int __arch_hweight32(unsigned int w) +unsigned int __sw_hweight32(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); @@ -17,30 +17,30 @@ unsigned int __arch_hweight32(unsigned int w) res = res + (res >> 8); return (res + (res >> 16)) & 0x000000FF; } -EXPORT_SYMBOL(__arch_hweight32); +EXPORT_SYMBOL(__sw_hweight32); -unsigned int __arch_hweight16(unsigned int w) +unsigned int __sw_hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } -EXPORT_SYMBOL(__arch_hweight16); +EXPORT_SYMBOL(__sw_hweight16); -unsigned int __arch_hweight8(unsigned int w) +unsigned int __sw_hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } -EXPORT_SYMBOL(__arch_hweight8); +EXPORT_SYMBOL(__sw_hweight8); -unsigned long __arch_hweight64(__u64 w) +unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 - return __arch_hweight32((unsigned int)(w >> 32)) + - __arch_hweight32((unsigned int)w); + return __sw_hweight32((unsigned int)(w >> 32)) + + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 #ifdef ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; @@ -57,4 +57,4 @@ unsigned long __arch_hweight64(__u64 w) #endif #endif } -EXPORT_SYMBOL(__arch_hweight64); +EXPORT_SYMBOL(__sw_hweight64); diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index f9bdf26..cbcd654 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO $@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) + +# misc stuff +# --------------------------------------------------------------------------- +quote:=" -- 1.6.4.2 -- Regards/Gruss, Boris. -- Advanced Micro Devices, Inc. Operating Systems Research Center -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: H. Peter Anvin on 22 Feb 2010 15:50 On 02/22/2010 10:49 AM, Borislav Petkov wrote: >> >> Just a note: this still means rdi is clobbered on x86-64, which is >> probably fine, but needs to be recorded as such. Since gcc doesn't >> support clobbers for registers used as operands (sigh), you have to >> create a dummy output and assign it a "=D" constraint. >> >> I don't know if gcc would handle -fcall-saved-rdi here... and if so, how >> reliably. > > Ok, from looking at kernel/sched.s output it looks like it saves rdi > content over the alternative where needed. I'll do some more testing > just to make sure. > No, you can't rely on behavioral observation. A different version of gcc could behave differently. We need to make sure we tell gcc what the requirements actually are, as opposed to thinking we can just fix them. +#define POPCNT ".byte 0xf3\n\t.byte 0x48\n\t.byte 0x0f\n\t.byte 0xb8\n\t.byte 0xc7" BTW, this can be written: #define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7" -hpa -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Borislav Petkov on 23 Feb 2010 01:40 From: "H. Peter Anvin" <hpa(a)zytor.com> Date: Mon, Feb 22, 2010 at 11:55:48AM -0800 > On 02/22/2010 10:49 AM, Borislav Petkov wrote: > >> > >> Just a note: this still means rdi is clobbered on x86-64, which is > >> probably fine, but needs to be recorded as such. Since gcc doesn't > >> support clobbers for registers used as operands (sigh), you have to > >> create a dummy output and assign it a "=D" constraint. > >> > >> I don't know if gcc would handle -fcall-saved-rdi here... and if so, how > >> reliably. > > > > Ok, from looking at kernel/sched.s output it looks like it saves rdi > > content over the alternative where needed. I'll do some more testing > > just to make sure. > > > > No, you can't rely on behavioral observation. A different version of > gcc could behave differently. We need to make sure we tell gcc what the > requirements actually are, as opposed to thinking we can just fix them. Ok, I've added the dummy "=D" constraint since it sounded like the more stable thing to do WRT gcc versions. BTW, I left the machine overnight and it is still alive cheerfully building randconfigs. I'll try the -fcall-saved-rdi thing also later today. > +#define POPCNT ".byte 0xf3\n\t.byte 0x48\n\t.byte 0x0f\n\t.byte > 0xb8\n\t.byte 0xc7" > > BTW, this can be written: > > #define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7" done, updated version coming up. -- Regards/Gruss, Boris. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Borislav Petkov on 23 Feb 2010 11:00 From: "H. Peter Anvin" <hpa(a)zytor.com> Date: Mon, Feb 22, 2010 at 11:55:48AM -0800 > On 02/22/2010 10:49 AM, Borislav Petkov wrote: > >> > >> Just a note: this still means rdi is clobbered on x86-64, which is > >> probably fine, but needs to be recorded as such. Since gcc doesn't > >> support clobbers for registers used as operands (sigh), you have to > >> create a dummy output and assign it a "=D" constraint. > >> > >> I don't know if gcc would handle -fcall-saved-rdi here... and if so, how > >> reliably. Hmm, we cannot do that with the current design since __arch_hweight64 is being inlined into every callsite and AFAICT we would have to build every callsite with "-fcall-saved-rdi" which is clearly too much. The explicit "=D" dummy constraint is straightforward, instead. > > Ok, from looking at kernel/sched.s output it looks like it saves rdi > > content over the alternative where needed. I'll do some more testing > > just to make sure. > > > > No, you can't rely on behavioral observation. A different version of > gcc could behave differently. We need to make sure we tell gcc what the > requirements actually are, as opposed to thinking we can just fix them. Just to make clear: those observations were referring to the version _with_ the dummy "=D" constraint - I was simply verifying the asm output and wasn't relying on behavioral observation. > +#define POPCNT ".byte 0xf3\n\t.byte 0x48\n\t.byte 0x0f\n\t.byte > 0xb8\n\t.byte 0xc7" > > BTW, this can be written: > > #define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7" Done, here's the latest version, it boots fine and testing is ongoing: -- From: Borislav Petkov <borislav.petkov(a)amd.com> Date: Thu, 11 Feb 2010 00:48:31 +0100 Subject: [PATCH] x86: Add optimized popcnt variants Add support for the hardware version of the Hamming weight function, popcnt, present in CPUs which advertize it under CPUID, Function 0x0000_0001_ECX[23]. On CPUs which don't support it, we fallback to the default lib/hweight.c sw versions. A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost a 3x speedup on a F10h machine. Signed-off-by: Borislav Petkov <borislav.petkov(a)amd.com> --- arch/x86/Kconfig | 5 ++ arch/x86/include/asm/alternative.h | 9 +++- arch/x86/include/asm/arch_hweight.h | 61 +++++++++++++++++++++++++++++ arch/x86/include/asm/bitops.h | 4 +- include/asm-generic/bitops/arch_hweight.h | 22 ++++++++-- lib/Makefile | 3 + lib/hweight.c | 20 +++++----- scripts/Makefile.lib | 4 ++ 8 files changed, 110 insertions(+), 18 deletions(-) create mode 100644 arch/x86/include/asm/arch_hweight.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb40925..176950e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -230,6 +230,11 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR +config ARCH_HWEIGHT_CFLAGS + string + default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 + default "-fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7..0720c96 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -39,9 +39,6 @@ #define LOCK_PREFIX "" #endif -/* This must be included *after* the definition of LOCK_PREFIX */ -#include <asm/cpufeature.h> - struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; @@ -91,6 +88,12 @@ static inline void alternatives_smp_switch(int smp) {} ".previous" /* + * This must be included *after* the definition of ALTERNATIVE due to + * <asm/arch_hweight.h> + */ +#include <asm/cpufeature.h> + +/* * Alternative instructions for different CPU types or capabilities. * * This allows to use optimized instructions even on generic binary diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h new file mode 100644 index 0000000..eaefadc --- /dev/null +++ b/arch/x86/include/asm/arch_hweight.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_HWEIGHT_H +#define _ASM_X86_HWEIGHT_H + +#ifdef CONFIG_64BIT +/* popcnt %rdi, %rax */ +#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7" +#define REG_IN "D" +#define REG_OUT "a" +#else +/* popcnt %eax, %eax */ +#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0" +#define REG_IN "a" +#define REG_OUT "a" +#endif + +/* + * __sw_hweightXX are called from within the alternatives below + * and callee-clobbered registers need to be taken care of. See + * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective + * compiler switches. + */ +static inline unsigned int __arch_hweight32(unsigned int w) +{ + unsigned int res = 0; + + asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res) + : REG_IN (w)); + + return res; +} + +static inline unsigned int __arch_hweight16(unsigned int w) +{ + return __arch_hweight32(w & 0xffff); +} + +static inline unsigned int __arch_hweight8(unsigned int w) +{ + return __arch_hweight32(w & 0xff); +} + +static inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + /* tell gcc that %rdi is clobbered as an input operand */ + unsigned long dummy; + +#ifdef CONFIG_X86_32 + return __arch_hweight32((u32)w) + + __arch_hweight32((u32)(w >> 32)); +#else + asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res), "="REG_IN (dummy) + : REG_IN (w)); +#endif /* CONFIG_X86_32 */ + + return res; +} + +#endif diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 02b47a6..545776e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -444,7 +444,9 @@ static inline int fls(int x) #define ARCH_HAS_FAST_MULTIPLIER 1 -#include <asm-generic/bitops/hweight.h> +#include <asm/arch_hweight.h> + +#include <asm-generic/bitops/const_hweight.h> #endif /* __KERNEL__ */ diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h index 3a7be84..9a81c1e 100644 --- a/include/asm-generic/bitops/arch_hweight.h +++ b/include/asm-generic/bitops/arch_hweight.h @@ -3,9 +3,23 @@ #include <asm/types.h> -extern unsigned int __arch_hweight32(unsigned int w); -extern unsigned int __arch_hweight16(unsigned int w); -extern unsigned int __arch_hweight8(unsigned int w); -extern unsigned long __arch_hweight64(__u64 w); +inline unsigned int __arch_hweight32(unsigned int w) +{ + return __sw_hweight32(w); +} +inline unsigned int __arch_hweight16(unsigned int w) +{ + return __sw_hweight16(w); +} + +inline unsigned int __arch_hweight8(unsigned int w) +{ + return __sw_hweight8(w); +} + +inline unsigned long __arch_hweight64(__u64 w) +{ + return __sw_hweight64(w); +} #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */ diff --git a/lib/Makefile b/lib/Makefile index 3b0b4a6..e2ad17c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o + +CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o + obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/lib/hweight.c b/lib/hweight.c index 9ff86df..f9ce440 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,7 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ -unsigned int __arch_hweight32(unsigned int w) +unsigned int __sw_hweight32(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); @@ -17,30 +17,30 @@ unsigned int __arch_hweight32(unsigned int w) res = res + (res >> 8); return (res + (res >> 16)) & 0x000000FF; } -EXPORT_SYMBOL(__arch_hweight32); +EXPORT_SYMBOL(__sw_hweight32); -unsigned int __arch_hweight16(unsigned int w) +unsigned int __sw_hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } -EXPORT_SYMBOL(__arch_hweight16); +EXPORT_SYMBOL(__sw_hweight16); -unsigned int __arch_hweight8(unsigned int w) +unsigned int __sw_hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } -EXPORT_SYMBOL(__arch_hweight8); +EXPORT_SYMBOL(__sw_hweight8); -unsigned long __arch_hweight64(__u64 w) +unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 - return __arch_hweight32((unsigned int)(w >> 32)) + - __arch_hweight32((unsigned int)w); + return __sw_hweight32((unsigned int)(w >> 32)) + + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 #ifdef ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; @@ -57,4 +57,4 @@ unsigned long __arch_hweight64(__u64 w) #endif #endif } -EXPORT_SYMBOL(__arch_hweight64); +EXPORT_SYMBOL(__sw_hweight64); diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index f9bdf26..cbcd654 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO $@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) + +# misc stuff +# --------------------------------------------------------------------------- +quote:=" -- 1.6.4.2 -- Regards/Gruss, Boris. -- Advanced Micro Devices, Inc. Operating Systems Research Center -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: H. Peter Anvin on 23 Feb 2010 12:40
On 02/23/2010 07:58 AM, Borislav Petkov wrote: > > Hmm, we cannot do that with the current design since __arch_hweight64 > is being inlined into every callsite and AFAICT we would have to build > every callsite with "-fcall-saved-rdi" which is clearly too much. The > explicit "=D" dummy constraint is straightforward, instead. > Uh... the -fcall-saved-rdi would go with all the other ones. Assuming it can actually work and that gcc doesn't choke on an inbound argument being saved. -hpa -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |