Prev: [PATCH 1/3] [resend] usbtouchscreen: convert from usb_device to usb_interface
Next: MIPS: Wire up recvmmsg syscall
From: Randy Dunlap on 1 Dec 2009 12:30 Borislav Petkov wrote: >> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2367: DRAM MEM-CTL PCI Bus ID: 0000:00:18.2 >> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2369: Misc device PCI Bus ID: 0000:00:18.3 >> calling alsa_pcm_init+0x0/0x71 [snd_pcm] @ 1402 >> initcall alsa_pcm_init+0x0/0x71 [snd_pcm] returned 0 after 17 usecs >> EDAC amd64: ECC is enabled by BIOS. >> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 0 >> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 2 >> amd64_nb_mce_bank_enabled_on_node: weight: 2 >> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 0, MCG_CTL: 0x1f, NB MSR is enabled >> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 2, MCG_CTL: 0x0, NB MSR is disabled >> ============================================================================= >> BUG kmalloc-16: Redzone overwritten >> ----------------------------------------------------------------------------- > > Hmm, I think I know what happens. This machine has non-contigious > core enumeration on a node (e.g. 0,2 on node 0 instead of 0,1) but > rdmsr_on_cpus assumes the former. Therefore we write outside of the > allocated msrs struct and thus the redzone overwrite. Here's a simple > fix that should take care of it. Please apply on top of the debugging > patch and catch the output again so that we could verify it. > > I'll fix this properly when I get back and then maybe even backport it > depending on the intrusiveness of the changes. Hi, Here's the new log file (attached). thanks. -- ~Randy
From: Borislav Petkov on 2 Dec 2009 06:00 On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap wrote: > Here's the new log file (attached). Thanks for testing. Meanwhile, I noticed that the other places where rdmsr_on_cpus() gets called with non-contigious cpumasks need fixing too. Here's a version that takes care of that, I'd be nice if you could give it a run too (patch against today's upstream). You could also enforce the module loading by setting 'ecc_enable_override=1' to verify the other rdmsr_on_cpus calls. Thanks. --- diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index a38831c..da2428b 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2618,6 +2618,9 @@ static int amd64_init_csrows(struct mem_ctl_info *mci) return empty; } +static struct msr *alloc_msrs(const cpumask_t *mask); +static void free_msrs(struct msr *msrs); + /* * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we" * enable it. @@ -2627,14 +2630,16 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) struct amd64_pvt *pvt = mci->pvt_info; const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id); int cpu, idx = 0, err = 0; - struct msr msrs[cpumask_weight(cpumask)]; + struct msr *msrs; u32 value; u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn; if (!ecc_enable_override) return; - memset(msrs, 0, sizeof(msrs)); + msrs = alloc_msrs(cpumask); + if (!msrs) + return; amd64_printk(KERN_WARNING, "'ecc_enable_override' parameter is active, " @@ -2697,20 +2702,24 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) (value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"); pvt->ctl_error_info.nbcfg = value; + + free_msrs(msrs); } static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) { const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id); int cpu, idx = 0, err = 0; - struct msr msrs[cpumask_weight(cpumask)]; + struct msr *msrs; u32 value; u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn; if (!pvt->nbctl_mcgctl_saved) return; - memset(msrs, 0, sizeof(msrs)); + msrs = alloc_msrs(cpumask); + if (!msrs) + return; err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value); if (err) @@ -2731,6 +2740,8 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) } wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs); + + free_msrs(msrs); } /* get all cores on this DCT */ @@ -2743,6 +2754,40 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid) cpumask_set_cpu(cpu, mask); } +/* + * Allocate enough msr structs for the supplied cpumask. Also, take care of + * non-contigious bitmasks. + */ +static struct msr *alloc_msrs(const cpumask_t *mask) +{ + struct msr *msrs; + int i, first_cpu, last_cpu = 0; + + if (cpumask_empty(mask)) { + amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n", __func__); + return NULL; + } + + first_cpu = cpumask_first(mask); + for (i = first_cpu; i < nr_cpu_ids; i++) + if (cpumask_test_cpu(i, mask)) + last_cpu = i; + + msrs = kzalloc(sizeof(*msrs) * (last_cpu - first_cpu + 1), GFP_KERNEL); + if (!msrs) { + amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", + __func__); + return NULL; + } + + return msrs; +} + +static void free_msrs(struct msr *msrs) +{ + kfree(msrs); +} + /* check MCG_CTL on all the cpus on this node */ static bool amd64_nb_mce_bank_enabled_on_node(int nid) { @@ -2755,12 +2800,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid) get_cpus_on_this_dct_cpumask(&mask, nid); - msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL); - if (!msrs) { - amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", - __func__); - return false; - } + msrs = alloc_msrs(&mask); + if (!msrs) + goto out_err; rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs); @@ -2779,7 +2821,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid) ret = true; out: - kfree(msrs); + free_msrs(msrs); + +out_err: return ret; } -- Regards/Gruss, Boris. Operating | Advanced Micro Devices GmbH System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M�nchen, Germany Research | Gesch�ftsf�hrer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M�nchen (OSRC) | Registergericht M�nchen, HRB Nr. 43632 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Randy Dunlap on 2 Dec 2009 13:20 On Wed, 2 Dec 2009 11:58:38 +0100 Borislav Petkov wrote: > On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap wrote: > > Here's the new log file (attached). > > Thanks for testing. Meanwhile, I noticed that the other places where > rdmsr_on_cpus() gets called with non-contigious cpumasks need fixing > too. Here's a version that takes care of that, I'd be nice if you could > give it a run too (patch against today's upstream). You could also > enforce the module loading by setting 'ecc_enable_override=1' to verify > the other rdmsr_on_cpus calls. > > Thanks. This patch also works for me. Thanks. Acked-by: Randy Dunlap <randy.dunlap(a)oracle.com> boot log attached. > --- > diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c > index a38831c..da2428b 100644 > --- a/drivers/edac/amd64_edac.c > +++ b/drivers/edac/amd64_edac.c > @@ -2618,6 +2618,9 @@ static int amd64_init_csrows(struct mem_ctl_info *mci) > return empty; > } > > +static struct msr *alloc_msrs(const cpumask_t *mask); > +static void free_msrs(struct msr *msrs); > + > /* > * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we" > * enable it. > @@ -2627,14 +2630,16 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) > struct amd64_pvt *pvt = mci->pvt_info; > const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id); > int cpu, idx = 0, err = 0; > - struct msr msrs[cpumask_weight(cpumask)]; > + struct msr *msrs; > u32 value; > u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn; > > if (!ecc_enable_override) > return; > > - memset(msrs, 0, sizeof(msrs)); > + msrs = alloc_msrs(cpumask); > + if (!msrs) > + return; > > amd64_printk(KERN_WARNING, > "'ecc_enable_override' parameter is active, " > @@ -2697,20 +2702,24 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) > (value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"); > > pvt->ctl_error_info.nbcfg = value; > + > + free_msrs(msrs); > } > > static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) > { > const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id); > int cpu, idx = 0, err = 0; > - struct msr msrs[cpumask_weight(cpumask)]; > + struct msr *msrs; > u32 value; > u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn; > > if (!pvt->nbctl_mcgctl_saved) > return; > > - memset(msrs, 0, sizeof(msrs)); > + msrs = alloc_msrs(cpumask); > + if (!msrs) > + return; > > err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value); > if (err) > @@ -2731,6 +2740,8 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) > } > > wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs); > + > + free_msrs(msrs); > } > > /* get all cores on this DCT */ > @@ -2743,6 +2754,40 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid) > cpumask_set_cpu(cpu, mask); > } > > +/* > + * Allocate enough msr structs for the supplied cpumask. Also, take care of > + * non-contigious bitmasks. > + */ > +static struct msr *alloc_msrs(const cpumask_t *mask) > +{ > + struct msr *msrs; > + int i, first_cpu, last_cpu = 0; > + > + if (cpumask_empty(mask)) { > + amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n", __func__); > + return NULL; > + } > + > + first_cpu = cpumask_first(mask); > + for (i = first_cpu; i < nr_cpu_ids; i++) > + if (cpumask_test_cpu(i, mask)) > + last_cpu = i; > + > + msrs = kzalloc(sizeof(*msrs) * (last_cpu - first_cpu + 1), GFP_KERNEL); > + if (!msrs) { > + amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", > + __func__); > + return NULL; > + } > + > + return msrs; > +} > + > +static void free_msrs(struct msr *msrs) > +{ > + kfree(msrs); > +} > + > /* check MCG_CTL on all the cpus on this node */ > static bool amd64_nb_mce_bank_enabled_on_node(int nid) > { > @@ -2755,12 +2800,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid) > > get_cpus_on_this_dct_cpumask(&mask, nid); > > - msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL); > - if (!msrs) { > - amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", > - __func__); > - return false; > - } > + msrs = alloc_msrs(&mask); > + if (!msrs) > + goto out_err; > > rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs); > > @@ -2779,7 +2821,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid) > ret = true; > > out: > - kfree(msrs); > + free_msrs(msrs); > + > +out_err: > return ret; > } > > > -- > Regards/Gruss, > Boris. > > Operating | Advanced Micro Devices GmbH > System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany > Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni > Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München > (OSRC) | Registergericht München, HRB Nr. 43632 > --- ~Randy
From: Doug Thompson on 2 Dec 2009 17:20
--- On Wed, 12/2/09, Randy Dunlap <randy.dunlap(a)oracle.com> wrote: > From: Randy Dunlap <randy.dunlap(a)oracle.com> > Subject: Re: 2.6.32-rc8: amd64_edac slub error > To: "Borislav Petkov" <borislav.petkov(a)amd.com> > Cc: "Borislav Petkov" <petkovbb(a)googlemail.com>, "LKML" <linux-kernel(a)vger.kernel.org>, "Doug Thompson" <dougthompson(a)xmission.com> > Date: Wednesday, December 2, 2009, 11:11 AM > On Wed, 2 Dec 2009 11:58:38 +0100 > Borislav Petkov wrote: > > > On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap > wrote: > > > Here's the new log file (attached). > > > > Thanks for testing. Meanwhile, I noticed that the > other places where > > rdmsr_on_cpus() gets called with non-contigious > cpumasks need fixing > > too. Here's a version that takes care of that, I'd be > nice if you could > > give it a run too (patch against today's upstream). > You could also > > enforce the module loading by setting > 'ecc_enable_override=1' to verify > > the other rdmsr_on_cpus calls. > > > > Thanks. > > This patch also works for me.� Thanks. > > Acked-by: Randy Dunlap <randy.dunlap(a)oracle.com> Acked-by: Doug Thompson <dougthompson(a)xmission.com> > > > boot log attached. > > > --- > > diff --git a/drivers/edac/amd64_edac.c > b/drivers/edac/amd64_edac.c > > index a38831c..da2428b 100644 > > --- a/drivers/edac/amd64_edac.c > > +++ b/drivers/edac/amd64_edac.c > > @@ -2618,6 +2618,9 @@ static int > amd64_init_csrows(struct mem_ctl_info *mci) > >� ��� return empty; > >� } > >� > > +static struct msr *alloc_msrs(const cpumask_t > *mask); > > +static void free_msrs(struct msr *msrs); > > + > >� /* > >���* Only if 'ecc_enable_override' is > set AND BIOS had ECC disabled, do "we" > >���* enable it. > > @@ -2627,14 +2630,16 @@ static void > amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) > >� ��� struct amd64_pvt *pvt = > mci->pvt_info; > >� ��� const cpumask_t *cpumask = > cpumask_of_node(pvt->mc_node_id); > >� ��� int cpu, idx = 0, err = 0; > > -��� struct msr > msrs[cpumask_weight(cpumask)]; > > +��� struct msr *msrs; > >� ��� u32 value; > >� ��� u32 mask = K8_NBCTL_CECCEn | > K8_NBCTL_UECCEn; > >� > >� ��� if (!ecc_enable_override) > >� ��� ��� return; > >� > > -��� memset(msrs, 0, sizeof(msrs)); > > +��� msrs = alloc_msrs(cpumask); > > +��� if (!msrs) > > +��� ��� return; > >� > >� ��� amd64_printk(KERN_WARNING, > >� ��� ��� > "'ecc_enable_override' parameter is active, " > > @@ -2697,20 +2702,24 @@ static void > amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci) > >� ��� ��� (value > & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"); > >� > >� ��� pvt->ctl_error_info.nbcfg > = value; > > + > > +��� free_msrs(msrs); > >� } > >� > >� static void > amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) > >� { > >� ��� const cpumask_t *cpumask = > cpumask_of_node(pvt->mc_node_id); > >� ��� int cpu, idx = 0, err = 0; > > -��� struct msr > msrs[cpumask_weight(cpumask)]; > > +��� struct msr *msrs; > >� ��� u32 value; > >� ��� u32 mask = K8_NBCTL_CECCEn | > K8_NBCTL_UECCEn; > >� > >� ��� if > (!pvt->nbctl_mcgctl_saved) > >� ��� ��� return; > >� > > -��� memset(msrs, 0, sizeof(msrs)); > > +��� msrs = alloc_msrs(cpumask); > > +��� if (!msrs) > > +��� ��� return; > >� > >� ��� err = > pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, > &value); > >� ��� if (err) > > @@ -2731,6 +2740,8 @@ static void > amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) > >� ��� } > >� > >� ��� wrmsr_on_cpus(cpumask, > K8_MSR_MCGCTL, msrs); > > + > > +��� free_msrs(msrs); > >� } > >� > >� /* get all cores on this DCT */ > > @@ -2743,6 +2754,40 @@ static void > get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid) > >� ��� ��� > ��� cpumask_set_cpu(cpu, mask); > >� } > >� > > +/* > > + * Allocate enough msr structs for the supplied > cpumask. Also, take care of > > + * non-contigious bitmasks. > > + */ > > +static struct msr *alloc_msrs(const cpumask_t *mask) > > +{ > > +��� struct msr *msrs; > > +��� int i, first_cpu, last_cpu = 0; > > + > > +��� if (cpumask_empty(mask)) { > > +��� ��� > amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n", > __func__); > > +��� ��� return NULL; > > +��� } > > + > > +��� first_cpu = cpumask_first(mask); > > +��� for (i = first_cpu; i < > nr_cpu_ids; i++) > > +��� ��� if > (cpumask_test_cpu(i, mask)) > > +��� ��� > ��� last_cpu = i; > > + > > +��� msrs = kzalloc(sizeof(*msrs) * > (last_cpu - first_cpu + 1), GFP_KERNEL); > > +��� if (!msrs) { > > +��� ��� > amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", > > +��� ��� > ��� � � � __func__); > > +��� > �����return NULL; > > +��� } > > + > > +��� return msrs; > > +} > > + > > +static void free_msrs(struct msr *msrs) > > +{ > > +�����kfree(msrs); > > +} > > + > >� /* check MCG_CTL on all the cpus on this node > */ > >� static bool > amd64_nb_mce_bank_enabled_on_node(int nid) > >� { > > @@ -2755,12 +2800,9 @@ static bool > amd64_nb_mce_bank_enabled_on_node(int nid) > >� > >� ��� > get_cpus_on_this_dct_cpumask(&mask, nid); > >� > > -��� msrs = kzalloc(sizeof(struct msr) > * cpumask_weight(&mask), GFP_KERNEL); > > -��� if (!msrs) { > > -��� ��� > amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", > > -��� ��� > ��� � � � __func__); > > -��� > �����return false; > > -��� } > > +��� msrs = alloc_msrs(&mask); > > +��� if (!msrs) > > +��� ��� goto out_err; > >� > >� ��� rdmsr_on_cpus(&mask, > MSR_IA32_MCG_CTL, msrs); > >� > > @@ -2779,7 +2821,9 @@ static bool > amd64_nb_mce_bank_enabled_on_node(int nid) > >� ��� ret = true; > >� > >� out: > > -��� kfree(msrs); > > +��� free_msrs(msrs); > > + > > +out_err: > >� ��� return ret; > >� } > >� > > > > -- > > Regards/Gruss, > > Boris. > > > > Operating | Advanced Micro Devices GmbH > >���System� | > Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M�nchen, > Germany > >� Research | Gesch�ftsf�hrer: Andrew Bowd, > Thomas M. McCoy, Giuliano Meroni > >���Center� | Sitz: Dornach, > Gemeinde Aschheim, Landkreis M�nchen > >���(OSRC)� | Registergericht > M�nchen, HRB Nr. 43632 > > > > > --- > ~Randy > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |