From: Alok Kataria on
Hi,

This patch adds a hook that lets architectures specify their own delay
calibration routine. The VMware platform uses it to calculate the lpj value
from the tsc_khz and HZ values for all processors.

Please note that this is a partial revert of -
commit 3da757daf86e498872855f0b5e101f763ba79499
x86: use cpu_khz for loops_per_jiffy calculation

where I added the lpj_fine variable to generic code so that we could do this
lpj calibration trick just for the boot processor (BP). It was considered wrong
to apply this trick to the application processors (APs), since on physical
systems an AP can be brought up at a lower frequency than the maximum possible
for power reasons. On VMware's platform the VCPUs always run at the same
clock speed as the TSC frequency, so we can extend this to all CPUs.

Please note that, although the original approach of doing this for just the BP
was a safe way to get around the "IO-APIC + timer doesn't work" problem on
VMware, we still need the APs to have correct lpj values so that timeouts work
correctly on all vcpus on our platform.

Please consider this for the x86 tree; it applies on top of the tip tree.

Signed-off-by: Alok N Kataria <akataria(a)vmware.com>

Index: linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/cpu/vmware.c 2010-07-08 13:53:33.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c 2010-07-19 16:47:53.000000000 -0700
@@ -23,6 +23,7 @@

#include <linux/dmi.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -42,6 +43,8 @@
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");

+static unsigned long lpj_fine;
+
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -51,7 +54,7 @@ static inline int __vmware_platform(void

static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;

VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,18 +65,35 @@ static unsigned long vmware_get_tsc_khz(
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
+
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ lpj_fine = lpj;
+
return tsc_hz;
}

+/*
+ * We can skip the delay calibration and assign it a value calculated based on
+ * the timer frequency. On VMware's platform all the cpu's run at the same
+ * frequency as the timer frequency, so use this value for all the processors.
+ */
+static unsigned long vmware_calibrate_delay(void)
+{
+ BUG_ON(!lpj_fine);
+ return lpj_fine;
+}
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;

VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);

- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
+ arch_calibrate_delay = vmware_calibrate_delay;
+ } else
printk(KERN_WARNING
"Failed to get TSC freq from the hypervisor\n");
}
Index: linux-x86-tree.git/include/linux/delay.h
===================================================================
--- linux-x86-tree.git.orig/include/linux/delay.h 2008-06-26 15:29:48.000000000 -0700
+++ linux-x86-tree.git/include/linux/delay.h 2010-07-19 16:31:21.000000000 -0700
@@ -41,7 +41,7 @@ static inline void ndelay(unsigned long
#define ndelay(x) ndelay(x)
#endif

-extern unsigned long lpj_fine;
+extern unsigned long (*arch_calibrate_delay)(void);
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
Index: linux-x86-tree.git/init/calibrate.c
===================================================================
--- linux-x86-tree.git.orig/init/calibrate.c 2010-02-07 16:38:44.000000000 -0800
+++ linux-x86-tree.git/init/calibrate.c 2010-07-19 17:00:04.000000000 -0700
@@ -10,8 +10,9 @@
#include <linux/timex.h>
#include <linux/smp.h>

-unsigned long lpj_fine;
unsigned long preset_lpj;
+unsigned long (*arch_calibrate_delay)(void);
+
static int __init lpj_setup(char *str)
{
preset_lpj = simple_strtoul(str,NULL,0);
@@ -130,10 +131,11 @@ void __cpuinit calibrate_delay(void)
if (!printed)
pr_info("Calibrating delay loop (skipped) "
"preset value.. ");
- } else if ((!printed) && lpj_fine) {
- loops_per_jiffy = lpj_fine;
- pr_info("Calibrating delay loop (skipped), "
- "value calculated using timer frequency.. ");
+ } else if (arch_calibrate_delay) {
+ loops_per_jiffy = arch_calibrate_delay();
+ if (!printed)
+ pr_info("Calibrating delay using platform "
+ "specific routine.. ");
} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
if (!printed)
pr_info("Calibrating delay using timer "
Index: linux-x86-tree.git/arch/x86/kernel/tsc.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/tsc.c 2010-07-19 16:30:35.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/tsc.c 2010-07-19 16:46:51.000000000 -0700
@@ -913,7 +913,6 @@ static inline unsigned long calibrate_cp

void __init tsc_init(void)
{
- u64 lpj;
int cpu;

x86_init.timers.tsc_pre_init();
@@ -952,10 +951,6 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;

- lpj = ((u64)tsc_khz * 1000);
- do_div(lpj, HZ);
- lpj_fine = lpj;
-
use_tsc_delay();
/* Check and install the TSC clocksource */
dmi_check_system(bad_tsc_dmi_table);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/