From: Benjamin Herrenschmidt on 21 Mar 2010 22:40 On Sun, 2010-03-21 at 00:13 -0700, Yinghai Lu wrote: > move it to kernel/fw_memmap.c from arch/x86/kernel/e820.c > > -v2: add fw_memmap wrapper to some func... > move some functions back to e820.c NAK This is even worse than before. You are now moving that entire pile of x86 gunk into "generic" code, but even keep it named e820 there! What happened to the discussion we had earlier, which iirc concluded that a better approach would be to adapt x86 to use LMB? Cheers, Ben. > Signed-off-by: Yinghai Lu <yinghai(a)kernel.org> > --- > arch/x86/include/asm/e820.h | 176 ++++++------- > arch/x86/kernel/e820.c | 638 ++---------------------------------------- > include/linux/bootmem.h | 2 +- > include/linux/early_res.h | 1 + > include/linux/fw_memmap.h | 40 +++ > kernel/Makefile | 2 +- > kernel/fw_memmap.c | 625 +++++++++++++++++++++++++++++++++++++++++ > kernel/fw_memmap_internals.h | 49 ++++ > 8 files changed, 822 insertions(+), 711 deletions(-) > create mode 100644 include/linux/fw_memmap.h > create mode 100644 kernel/fw_memmap.c > create mode 100644 kernel/fw_memmap_internals.h > > diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h > index 71c0348..c038616 100644 > --- a/arch/x86/include/asm/e820.h > +++ b/arch/x86/include/asm/e820.h > @@ -1,65 +1,10 @@ > #ifndef _ASM_X86_E820_H > #define _ASM_X86_E820_H > -#define E820MAP 0x2d0 /* our map */ > -#define E820MAX 128 /* number of entries in E820MAP */ > - > -/* > - * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the > - * constrained space in the zeropage. If we have more nodes than > - * that, and if we've booted off EFI firmware, then the EFI tables > - * passed us from the EFI firmware can list more nodes. 
Size our > - * internal memory map tables to have room for these additional > - * nodes, based on up to three entries per node for which the > - * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), > - * plus E820MAX, allowing space for the possible duplicate E820 > - * entries that might need room in the same arrays, prior to the > - * call to sanitize_e820_map() to remove duplicates. The allowance > - * of three memory map entries per node is "enough" entries for > - * the initial hardware platform motivating this mechanism to make > - * use of additional EFI map entries. Future platforms may want > - * to allow more than three entries per node or otherwise refine > - * this size. > - */ > - > -/* > - * Odd: 'make headers_check' complains about numa.h if I try > - * to collapse the next two #ifdef lines to a single line: > - * #if defined(__KERNEL__) && defined(CONFIG_EFI) > - */ > -#ifdef __KERNEL__ > -#ifdef CONFIG_EFI > -#include <linux/numa.h> > -#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) > -#else /* ! CONFIG_EFI */ > -#define E820_X_MAX E820MAX > -#endif > -#else /* ! 
__KERNEL__ */ > -#define E820_X_MAX E820MAX > -#endif > - > -#define E820NR 0x1e8 /* # entries in E820MAP */ > - > -#define E820_RAM 1 > -#define E820_RESERVED 2 > -#define E820_ACPI 3 > -#define E820_NVS 4 > -#define E820_UNUSABLE 5 > > /* reserved RAM used by kernel itself */ > #define E820_RESERVED_KERN 128 > > #ifndef __ASSEMBLY__ > -#include <linux/types.h> > -struct e820entry { > - __u64 addr; /* start of memory segment */ > - __u64 size; /* size of memory segment */ > - __u32 type; /* type of memory segment */ > -} __attribute__((packed)); > - > -struct e820map { > - __u32 nr_map; > - struct e820entry map[E820_X_MAX]; > -}; > > #define ISA_START_ADDRESS 0xa0000 > #define ISA_END_ADDRESS 0x100000 > @@ -69,32 +14,18 @@ struct e820map { > > #ifdef __KERNEL__ > > -#ifdef CONFIG_X86_OOSTORE > -extern int centaur_ram_top; > -void get_centaur_ram_top(void); > +#include <linux/fw_memmap.h> > + > +#ifdef CONFIG_MEMTEST > +extern void early_memtest(unsigned long start, unsigned long end); > #else > -static inline void get_centaur_ram_top(void) > +static inline void early_memtest(unsigned long start, unsigned long end) > { > } > #endif > > extern unsigned long pci_mem_start; > -extern int e820_any_mapped(u64 start, u64 end, unsigned type); > -extern int e820_all_mapped(u64 start, u64 end, unsigned type); > -extern void e820_add_region(u64 start, u64 size, int type); > -extern void e820_print_map(char *who); > -int sanitize_e820_map(void); > -void save_e820_map(void); > -extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, > - unsigned new_type); > -extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, > - int checktype); > -extern void update_e820(void); > extern void e820_setup_gap(void); > -extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, > - unsigned long start_addr, unsigned long long end_addr); > -struct setup_data; > -extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); > > #if 
defined(CONFIG_X86_64) || \ > (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) > @@ -105,37 +36,80 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) > } > #endif > > -#ifdef CONFIG_MEMTEST > -extern void early_memtest(unsigned long start, unsigned long end); > -#else > -static inline void early_memtest(unsigned long start, unsigned long end) > +static inline void e820_add_region(u64 start, u64 size, int type) > { > + fw_memmap_add_region(start, size, type); > +} > + > +static inline void e820_print_map(char *who) > +{ > + fw_memmap_print_map(who); > +} > + > +static inline int sanitize_e820_map(void) > +{ > + return sanitize_fw_memmap(); > +} > + > +static inline void finish_e820_parsing(void) > +{ > + finish_fw_memmap_parsing(); > +} > + > +static inline void e820_register_active_regions(int nid, > + unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + fw_memmap_register_active_regions(nid, start_pfn, end_pfn); > +} > + > +static inline u64 e820_hole_size(u64 start, u64 end) > +{ > + return fw_memmap_hole_size(start, end); > +} > + > +static inline u64 find_e820_area(u64 start, u64 end, u64 size, u64 align) > +{ > + return find_fw_memmap_area(start, end, size, align); > +} > + > +static inline u64 find_e820_area_node(int nid, u64 start, u64 end, > + u64 size, u64 align) > +{ > + return find_fw_memmap_area_node(nid, start, end, size, align); > } > -#endif > > -extern unsigned long end_user_pfn; > +static inline unsigned long e820_end_of_ram_pfn(void) > +{ > + return fw_memmap_end_of_ram_pfn(); > +} > + > +void clear_e820_map(void); > + > +extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, > + int checktype); > +struct e820entry; > +int __sanitize_e820_map(struct e820entry *biosmap, int max_nr, u32 *pnr_map); > +extern unsigned long e820_end_of_low_ram_pfn(void); > + > +extern int e820_any_mapped(u64 start, u64 end, unsigned type); > +extern int e820_all_mapped(u64 start, u64 end, unsigned type); > +extern u64 
e820_update_range(u64 start, u64 size, unsigned old_type, > + unsigned new_type); > + > +extern void update_e820(void); > +void save_e820_map(void); > +struct setup_data; > +extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); > +extern char *default_machine_specific_memory_setup(void); > +extern void setup_memory_map(void); > > -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); > extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); > -u64 find_e820_area_node(int nid, u64 start, u64 end, u64 size, u64 align); > + > extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); > -#include <linux/early_res.h> > > -extern unsigned long e820_end_of_ram_pfn(void); > -extern unsigned long e820_end_of_low_ram_pfn(void); > -extern int e820_find_active_region(const struct e820entry *ei, > - unsigned long start_pfn, > - unsigned long last_pfn, > - unsigned long *ei_startpfn, > - unsigned long *ei_endpfn); > -extern void e820_register_active_regions(int nid, unsigned long start_pfn, > - unsigned long end_pfn); > -extern u64 e820_hole_size(u64 start, u64 end); > -extern void finish_e820_parsing(void); > extern void e820_reserve_resources(void); > extern void e820_reserve_resources_late(void); > -extern void setup_memory_map(void); > -extern char *default_machine_specific_memory_setup(void); > > /* > * Returns true iff the specified range [s,e) is completely contained inside > @@ -146,7 +120,17 @@ static inline bool is_ISA_range(u64 s, u64 e) > return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; > } > > +#ifdef CONFIG_X86_OOSTORE > +extern int centaur_ram_top; > +void get_centaur_ram_top(void); > +#else > +static inline void get_centaur_ram_top(void) > +{ > +} > +#endif > + > #endif /* __KERNEL__ */ > + > #endif /* __ASSEMBLY__ */ > > #ifdef __KERNEL__ > diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c > index a558609..9f125ca 100644 > --- a/arch/x86/kernel/e820.c > +++ b/arch/x86/kernel/e820.c > @@ -12,18 
+12,15 @@ > #include <linux/types.h> > #include <linux/init.h> > #include <linux/bootmem.h> > -#include <linux/pfn.h> > #include <linux/suspend.h> > #include <linux/firmware-map.h> > > #include <asm/e820.h> > -#include <asm/proto.h> > #include <asm/setup.h> > > +#include "../../../kernel/fw_memmap_internals.h" > + > /* > - * The e820 map is the map that gets modified e.g. with command line parameters > - * and that is also registered with modifications in the kernel resource tree > - * with the iomem_resource as parent. > * > * The e820_saved is directly saved after the BIOS-provided memory map is > * copied. It doesn't get modified afterwards. It's registered for the > @@ -34,7 +31,6 @@ > * user can e.g. boot the original kernel with mem=1G while still booting the > * next kernel with full memory. > */ > -static struct e820map __initdata e820; > static struct e820map __initdata e820_saved; > > /* For PCI or other memory-mapped resources */ > @@ -99,295 +95,6 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) > return 0; > } > > -/* > - * Add a memory region to the kernel e820 map. > - */ > -static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, > - int type) > -{ > - int x = e820x->nr_map; > - > - if (x >= ARRAY_SIZE(e820x->map)) { > - printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); > - return; > - } > - > - e820x->map[x].addr = start; > - e820x->map[x].size = size; > - e820x->map[x].type = type; > - e820x->nr_map++; > -} > - > -void __init e820_add_region(u64 start, u64 size, int type) > -{ > - __e820_add_region(&e820, start, size, type); > -} > - > -static void __init e820_print_type(u32 type) > -{ > - switch (type) { > - case E820_RAM: > - case E820_RESERVED_KERN: > - printk(KERN_CONT "(usable)"); > - break; > - case E820_RESERVED: > - printk(KERN_CONT "(reserved)"); > - break; > - case E820_ACPI: > - printk(KERN_CONT "(ACPI data)"); > - break; > - case E820_NVS: > - printk(KERN_CONT "(ACPI NVS)"); > - break; > - case E820_UNUSABLE: > - printk(KERN_CONT "(unusable)"); > - break; > - default: > - printk(KERN_CONT "type %u", type); > - break; > - } > -} > - > -void __init e820_print_map(char *who) > -{ > - int i; > - > - for (i = 0; i < e820.nr_map; i++) { > - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, > - (unsigned long long) e820.map[i].addr, > - (unsigned long long) > - (e820.map[i].addr + e820.map[i].size)); > - e820_print_type(e820.map[i].type); > - printk(KERN_CONT "\n"); > - } > -} > - > -/* > - * Sanitize the BIOS e820 map. > - * > - * Some e820 responses include overlapping entries. The following > - * replaces the original e820 map with a new one, removing overlaps, > - * and resolving conflicting memory types in favor of highest > - * numbered type. > - * > - * The input parameter biosmap points to an array of 'struct > - * e820entry' which on entry has elements in the range [0, *pnr_map) > - * valid, and which has space for up to max_nr_map entries. > - * On return, the resulting sanitized e820 map entries will be in > - * overwritten in the same location, starting at biosmap. 
> - * > - * The integer pointed to by pnr_map must be valid on entry (the > - * current number of valid entries located at biosmap) and will > - * be updated on return, with the new number of valid entries > - * (something no more than max_nr_map.) > - * > - * The return value from sanitize_e820_map() is zero if it > - * successfully 'sanitized' the map entries passed in, and is -1 > - * if it did nothing, which can happen if either of (1) it was > - * only passed one map entry, or (2) any of the input map entries > - * were invalid (start + size < start, meaning that the size was > - * so big the described memory range wrapped around through zero.) > - * > - * Visually we're performing the following > - * (1,2,3,4 = memory types)... > - * > - * Sample memory map (w/overlaps): > - * ____22__________________ > - * ______________________4_ > - * ____1111________________ > - * _44_____________________ > - * 11111111________________ > - * ____________________33__ > - * ___________44___________ > - * __________33333_________ > - * ______________22________ > - * ___________________2222_ > - * _________111111111______ > - * _____________________11_ > - * _________________4______ > - * > - * Sanitized equivalent (no overlap): > - * 1_______________________ > - * _44_____________________ > - * ___1____________________ > - * ____22__________________ > - * ______11________________ > - * _________1______________ > - * __________3_____________ > - * ___________44___________ > - * _____________33_________ > - * _______________2________ > - * ________________1_______ > - * _________________4______ > - * ___________________2____ > - * ____________________33__ > - * ______________________4_ > - */ > - > -static int __init __sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, > - u32 *pnr_map) > -{ > - struct change_member { > - struct e820entry *pbios; /* pointer to original bios entry */ > - unsigned long long addr; /* address for this change point */ > - }; > - static 
struct change_member change_point_list[2*E820_X_MAX] __initdata; > - static struct change_member *change_point[2*E820_X_MAX] __initdata; > - static struct e820entry *overlap_list[E820_X_MAX] __initdata; > - static struct e820entry new_bios[E820_X_MAX] __initdata; > - struct change_member *change_tmp; > - unsigned long current_type, last_type; > - unsigned long long last_addr; > - int chgidx, still_changing; > - int overlap_entries; > - int new_bios_entry; > - int old_nr, new_nr, chg_nr; > - int i; > - > - /* if there's only one memory region, don't bother */ > - if (*pnr_map < 2) > - return -1; > - > - old_nr = *pnr_map; > - BUG_ON(old_nr > max_nr_map); > - > - /* bail out if we find any unreasonable addresses in bios map */ > - for (i = 0; i < old_nr; i++) > - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) > - return -1; > - > - /* create pointers for initial change-point information (for sorting) */ > - for (i = 0; i < 2 * old_nr; i++) > - change_point[i] = &change_point_list[i]; > - > - /* record all known change-points (starting and ending addresses), > - omitting those that are for empty memory regions */ > - chgidx = 0; > - for (i = 0; i < old_nr; i++) { > - if (biosmap[i].size != 0) { > - change_point[chgidx]->addr = biosmap[i].addr; > - change_point[chgidx++]->pbios = &biosmap[i]; > - change_point[chgidx]->addr = biosmap[i].addr + > - biosmap[i].size; > - change_point[chgidx++]->pbios = &biosmap[i]; > - } > - } > - chg_nr = chgidx; > - > - /* sort change-point list by memory addresses (low -> high) */ > - still_changing = 1; > - while (still_changing) { > - still_changing = 0; > - for (i = 1; i < chg_nr; i++) { > - unsigned long long curaddr, lastaddr; > - unsigned long long curpbaddr, lastpbaddr; > - > - curaddr = change_point[i]->addr; > - lastaddr = change_point[i - 1]->addr; > - curpbaddr = change_point[i]->pbios->addr; > - lastpbaddr = change_point[i - 1]->pbios->addr; > - > - /* > - * swap entries, when: > - * > - * curaddr > lastaddr or > - 
* curaddr == lastaddr and curaddr == curpbaddr and > - * lastaddr != lastpbaddr > - */ > - if (curaddr < lastaddr || > - (curaddr == lastaddr && curaddr == curpbaddr && > - lastaddr != lastpbaddr)) { > - change_tmp = change_point[i]; > - change_point[i] = change_point[i-1]; > - change_point[i-1] = change_tmp; > - still_changing = 1; > - } > - } > - } > - > - /* create a new bios memory map, removing overlaps */ > - overlap_entries = 0; /* number of entries in the overlap table */ > - new_bios_entry = 0; /* index for creating new bios map entries */ > - last_type = 0; /* start with undefined memory type */ > - last_addr = 0; /* start with 0 as last starting address */ > - > - /* loop through change-points, determining affect on the new bios map */ > - for (chgidx = 0; chgidx < chg_nr; chgidx++) { > - /* keep track of all overlapping bios entries */ > - if (change_point[chgidx]->addr == > - change_point[chgidx]->pbios->addr) { > - /* > - * add map entry to overlap list (> 1 entry > - * implies an overlap) > - */ > - overlap_list[overlap_entries++] = > - change_point[chgidx]->pbios; > - } else { > - /* > - * remove entry from list (order independent, > - * so swap with last) > - */ > - for (i = 0; i < overlap_entries; i++) { > - if (overlap_list[i] == > - change_point[chgidx]->pbios) > - overlap_list[i] = > - overlap_list[overlap_entries-1]; > - } > - overlap_entries--; > - } > - /* > - * if there are overlapping entries, decide which > - * "type" to use (larger value takes precedence -- > - * 1=usable, 2,3,4,4+=unusable) > - */ > - current_type = 0; > - for (i = 0; i < overlap_entries; i++) > - if (overlap_list[i]->type > current_type) > - current_type = overlap_list[i]->type; > - /* > - * continue building up new bios map based on this > - * information > - */ > - if (current_type != last_type) { > - if (last_type != 0) { > - new_bios[new_bios_entry].size = > - change_point[chgidx]->addr - last_addr; > - /* > - * move forward only if the new size > - * was non-zero 
> - */ > - if (new_bios[new_bios_entry].size != 0) > - /* > - * no more space left for new > - * bios entries ? > - */ > - if (++new_bios_entry >= max_nr_map) > - break; > - } > - if (current_type != 0) { > - new_bios[new_bios_entry].addr = > - change_point[chgidx]->addr; > - new_bios[new_bios_entry].type = current_type; > - last_addr = change_point[chgidx]->addr; > - } > - last_type = current_type; > - } > - } > - /* retain count for new bios entries */ > - new_nr = new_bios_entry; > - > - /* copy new bios mapping into original location */ > - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); > - *pnr_map = new_nr; > - > - return 0; > -} > - > -int __init sanitize_e820_map(void) > -{ > - return __sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); > -} > - > static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) > { > while (nr_map) { > @@ -509,52 +216,6 @@ static u64 __init e820_update_range_saved(u64 start, u64 size, > new_type); > } > > -/* make e820 not cover the range */ > -u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, > - int checktype) > -{ > - int i; > - u64 end; > - u64 real_removed_size = 0; > - > - if (size > (ULLONG_MAX - start)) > - size = ULLONG_MAX - start; > - > - end = start + size; > - printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", > - (unsigned long long) start, > - (unsigned long long) end); > - e820_print_type(old_type); > - printk(KERN_CONT "\n"); > - > - for (i = 0; i < e820.nr_map; i++) { > - struct e820entry *ei = &e820.map[i]; > - u64 final_start, final_end; > - > - if (checktype && ei->type != old_type) > - continue; > - /* totally covered? 
*/ > - if (ei->addr >= start && > - (ei->addr + ei->size) <= (start + size)) { > - real_removed_size += ei->size; > - memset(ei, 0, sizeof(struct e820entry)); > - continue; > - } > - /* partially covered */ > - final_start = max(start, ei->addr); > - final_end = min(start + size, ei->addr + ei->size); > - if (final_start >= final_end) > - continue; > - real_removed_size += final_end - final_start; > - > - ei->size -= final_end - final_start; > - if (ei->addr < final_start) > - continue; > - ei->addr = final_end; > - } > - return real_removed_size; > -} > - > void __init update_e820(void) > { > u32 nr_map; > @@ -566,20 +227,24 @@ void __init update_e820(void) > printk(KERN_INFO "modified physical RAM map:\n"); > e820_print_map("modified"); > } > + > static void __init update_e820_saved(void) > { > u32 nr_map; > + int max_nr_map = ARRAY_SIZE(e820_saved.map); > > nr_map = e820_saved.nr_map; > - if (__sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) > + if (__sanitize_e820_map(e820_saved.map, max_nr_map, &nr_map)) > return; > e820_saved.nr_map = nr_map; > } > + > #define MAX_GAP_END 0x100000000ull > /* > * Search for a gap in the e820 memory space from start_addr to end_addr. > */ > -__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, > +static int __init > +e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, > unsigned long start_addr, unsigned long long end_addr) > { > unsigned long long last; > @@ -726,37 +391,6 @@ static int __init e820_mark_nvs_memory(void) > core_initcall(e820_mark_nvs_memory); > #endif > > -/* > - * Find a free area with specified alignment in a specific range. 
> - */ > -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) > -{ > - int i; > - > - for (i = 0; i < e820.nr_map; i++) { > - struct e820entry *ei = &e820.map[i]; > - u64 addr; > - u64 ei_start, ei_last; > - > - if (ei->type != E820_RAM) > - continue; > - > - ei_last = ei->addr + ei->size; > - ei_start = ei->addr; > - addr = find_early_area(ei_start, ei_last, start, end, > - size, align); > - > - if (addr != -1ULL) > - return addr; > - } > - return -1ULL; > -} > - > -u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) > -{ > - return find_e820_area(start, end, size, align); > -} > - > u64 __init get_max_mapped(void) > { > u64 end = max_pfn_mapped; > @@ -765,6 +399,7 @@ u64 __init get_max_mapped(void) > > return end; > } > + > /* > * Find next free range after *start > */ > @@ -792,21 +427,6 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) > return -1ULL; > } > > -u64 __init find_e820_area_node(int nid, u64 start, u64 end, u64 size, u64 align) > -{ > - u64 addr; > - /* > - * need to call this function after e820_register_active_regions > - * so early_node_map[] is set > - */ > - addr = find_memory_core_early(nid, size, align, start, end); > - if (addr != -1ULL) > - return addr; > - > - /* fallback, should already have start end in the node range */ > - return find_e820_area(start, end, size, align); > -} > - > /* > * pre allocated 4k and reserved it in e820 > */ > @@ -843,220 +463,6 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) > return addr; > } > > -#ifdef CONFIG_X86_32 > -# ifdef CONFIG_X86_PAE > -# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) > -# else > -# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) > -# endif > -#else /* CONFIG_X86_32 */ > -# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT > -#endif > - > -/* > - * Find the highest page frame number we have available > - */ > -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) > -{ > - int i; > - unsigned long 
last_pfn = 0; > - unsigned long max_arch_pfn = MAX_ARCH_PFN; > - > - for (i = 0; i < e820.nr_map; i++) { > - struct e820entry *ei = &e820.map[i]; > - unsigned long start_pfn; > - unsigned long end_pfn; > - > - if (ei->type != type) > - continue; > - > - start_pfn = ei->addr >> PAGE_SHIFT; > - end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; > - > - if (start_pfn >= limit_pfn) > - continue; > - if (end_pfn > limit_pfn) { > - last_pfn = limit_pfn; > - break; > - } > - if (end_pfn > last_pfn) > - last_pfn = end_pfn; > - } > - > - if (last_pfn > max_arch_pfn) > - last_pfn = max_arch_pfn; > - > - printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", > - last_pfn, max_arch_pfn); > - return last_pfn; > -} > -unsigned long __init e820_end_of_ram_pfn(void) > -{ > - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); > -} > - > -unsigned long __init e820_end_of_low_ram_pfn(void) > -{ > - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); > -} > -/* > - * Finds an active region in the address range from start_pfn to last_pfn and > - * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
> - */ > -int __init e820_find_active_region(const struct e820entry *ei, > - unsigned long start_pfn, > - unsigned long last_pfn, > - unsigned long *ei_startpfn, > - unsigned long *ei_endpfn) > -{ > - u64 align = PAGE_SIZE; > - > - *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; > - *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; > - > - /* Skip map entries smaller than a page */ > - if (*ei_startpfn >= *ei_endpfn) > - return 0; > - > - /* Skip if map is outside the node */ > - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || > - *ei_startpfn >= last_pfn) > - return 0; > - > - /* Check for overlaps */ > - if (*ei_startpfn < start_pfn) > - *ei_startpfn = start_pfn; > - if (*ei_endpfn > last_pfn) > - *ei_endpfn = last_pfn; > - > - return 1; > -} > - > -/* Walk the e820 map and register active regions within a node */ > -void __init e820_register_active_regions(int nid, unsigned long start_pfn, > - unsigned long last_pfn) > -{ > - unsigned long ei_startpfn; > - unsigned long ei_endpfn; > - int i; > - > - for (i = 0; i < e820.nr_map; i++) > - if (e820_find_active_region(&e820.map[i], > - start_pfn, last_pfn, > - &ei_startpfn, &ei_endpfn)) > - add_active_range(nid, ei_startpfn, ei_endpfn); > -} > - > -/* > - * Find the hole size (in bytes) in the memory range. 
> - * @start: starting address of the memory range to scan > - * @end: ending address of the memory range to scan > - */ > -u64 __init e820_hole_size(u64 start, u64 end) > -{ > - unsigned long start_pfn = start >> PAGE_SHIFT; > - unsigned long last_pfn = end >> PAGE_SHIFT; > - unsigned long ei_startpfn, ei_endpfn, ram = 0; > - int i; > - > - for (i = 0; i < e820.nr_map; i++) { > - if (e820_find_active_region(&e820.map[i], > - start_pfn, last_pfn, > - &ei_startpfn, &ei_endpfn)) > - ram += ei_endpfn - ei_startpfn; > - } > - return end - start - ((u64)ram << PAGE_SHIFT); > -} > - > -static void early_panic(char *msg) > -{ > - early_printk(msg); > - panic(msg); > -} > - > -static int userdef __initdata; > - > -/* "mem=nopentium" disables the 4MB page tables. */ > -static int __init parse_memopt(char *p) > -{ > - u64 mem_size; > - > - if (!p) > - return -EINVAL; > - > -#ifdef CONFIG_X86_32 > - if (!strcmp(p, "nopentium")) { > - setup_clear_cpu_cap(X86_FEATURE_PSE); > - return 0; > - } > -#endif > - > - userdef = 1; > - mem_size = memparse(p, &p); > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); > - > - return 0; > -} > -early_param("mem", parse_memopt); > - > -static int __init parse_memmap_opt(char *p) > -{ > - char *oldp; > - u64 start_at, mem_size; > - > - if (!p) > - return -EINVAL; > - > - if (!strncmp(p, "exactmap", 8)) { > -#ifdef CONFIG_CRASH_DUMP > - /* > - * If we are doing a crash dump, we still need to know > - * the real mem size before original memory map is > - * reset. 
> - */ > - saved_max_pfn = e820_end_of_ram_pfn(); > -#endif > - e820.nr_map = 0; > - userdef = 1; > - return 0; > - } > - > - oldp = p; > - mem_size = memparse(p, &p); > - if (p == oldp) > - return -EINVAL; > - > - userdef = 1; > - if (*p == '@') { > - start_at = memparse(p+1, &p); > - e820_add_region(start_at, mem_size, E820_RAM); > - } else if (*p == '#') { > - start_at = memparse(p+1, &p); > - e820_add_region(start_at, mem_size, E820_ACPI); > - } else if (*p == '$') { > - start_at = memparse(p+1, &p); > - e820_add_region(start_at, mem_size, E820_RESERVED); > - } else > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); > - > - return *p == '\0' ? 0 : -EINVAL; > -} > -early_param("memmap", parse_memmap_opt); > - > -void __init finish_e820_parsing(void) > -{ > - if (userdef) { > - u32 nr = e820.nr_map; > - > - if (__sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) > - early_panic("Invalid user supplied memory map"); > - e820.nr_map = nr; > - > - printk(KERN_INFO "user-defined physical RAM map:\n"); > - e820_print_map("user"); > - } > -} > - > static inline const char *e820_type_to_string(int e820_type) > { > switch (e820_type) { > @@ -1098,7 +504,8 @@ void __init e820_reserve_resources(void) > * pci device BAR resource and insert them later in > * pcibios_resource_survey() > */ > - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { > + if (e820.map[i].type != E820_RESERVED || > + res->start < (1ULL<<20)) { > res->flags |= IORESOURCE_BUSY; > insert_resource(&iomem_resource, res); > } > @@ -1114,7 +521,7 @@ void __init e820_reserve_resources(void) > } > > /* How much should we pad RAM ending depending on where it is? 
*/ > -static unsigned long ram_alignment(resource_size_t pos) > +static unsigned long __init ram_alignment(resource_size_t pos) > { > unsigned long mb = pos >> 20; > > @@ -1196,7 +603,7 @@ char *__init default_machine_specific_memory_setup(void) > who = "BIOS-e801"; > } > > - e820.nr_map = 0; > + clear_e820_map(); > e820_add_region(0, LOWMEMSIZE(), E820_RAM); > e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); > } > @@ -1204,7 +611,6 @@ char *__init default_machine_specific_memory_setup(void) > /* In case someone cares... */ > return who; > } > - > void __init save_e820_map(void) > { > memcpy(&e820_saved, &e820, sizeof(struct e820map)); > @@ -1221,20 +627,18 @@ void __init setup_memory_map(void) > } > > #ifdef CONFIG_X86_OOSTORE > + > /* > * Figure what we can cover with MCR's > * > * Shortcut: We know you can't put 4Gig of RAM on a winchip > */ > -void __init get_centaur_ram_top(void) > +static void __init __get_special_low_ram_top(void) > { > u32 clip = 0xFFFFFFFFUL; > u32 top = 0; > int i; > > - if (boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) > - return; > - > for (i = 0; i < e820.nr_map; i++) { > unsigned long start, end; > > @@ -1272,7 +676,15 @@ void __init get_centaur_ram_top(void) > if (top > clip) > top = clip; > > - centaur_ram_top = top; > + return top; > } > -#endif > > +int centaur_ram_top; > +void __init get_centaur_ram_top(void) > +{ > + if (boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) > + return; > + > + centaur_ram_top = __get_special_low_ram_top(); > +} > +#endif > diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h > index 266ab92..c341c18 100644 > --- a/include/linux/bootmem.h > +++ b/include/linux/bootmem.h > @@ -6,7 +6,7 @@ > > #include <linux/mmzone.h> > #include <asm/dma.h> > - > +#include <linux/early_res.h> > /* > * simple boot-time physical memory area allocator. 
> */ > diff --git a/include/linux/early_res.h b/include/linux/early_res.h > index 29c09f5..0f4590f 100644 > --- a/include/linux/early_res.h > +++ b/include/linux/early_res.h > @@ -14,6 +14,7 @@ u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, > u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, > u64 *sizep, u64 align); > u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); > +u64 find_fw_memmap_area_node(int nid, u64 start, u64 end, u64 size, u64 align); > u64 get_max_mapped(void); > #include <linux/range.h> > int get_free_all_memory_range(struct range **rangep, int nodeid); > diff --git a/include/linux/fw_memmap.h b/include/linux/fw_memmap.h > new file mode 100644 > index 0000000..e0fcc1b > --- /dev/null > +++ b/include/linux/fw_memmap.h > @@ -0,0 +1,40 @@ > +#ifndef _LINUX_FW_MEMMAP_H > +#define _LINUX_FW_MEMMAP_H > +#define E820MAX 128 /* number of entries in E820MAP */ > + > +#define FW_MEMMAP_RAM 1 > +#define FW_MEMMAP_RESERVED 2 > + > +#define E820_RAM FW_MEMMAP_RAM > +#define E820_RESERVED FW_MEMMAP_RESERVED > + > +#define E820_ACPI 3 > +#define E820_NVS 4 > +#define E820_UNUSABLE 5 > + > +#ifndef __ASSEMBLY__ > +#include <linux/types.h> > +struct e820entry { > + __u64 addr; /* start of memory segment */ > + __u64 size; /* size of memory segment */ > + __u32 type; /* type of memory segment */ > +} __attribute__((packed)); > + > +#ifdef __KERNEL__ > + > +void fw_memmap_add_region(u64 start, u64 size, int type); > +void fw_memmap_print_map(char *who); > +int sanitize_fw_memmap(void); > +void finish_fw_memmap_parsing(void); > + > +#include <linux/early_res.h> > + > +unsigned long fw_memmap_end_of_ram_pfn(void); > +void fw_memmap_register_active_regions(int nid, unsigned long start_pfn, > + unsigned long end_pfn); > +u64 fw_memmap_hole_size(u64 start, u64 end); > + > +#endif /* __KERNEL__ */ > +#endif /* __ASSEMBLY__ */ > + > +#endif /* _LINUX_FW_MEMMAP_H */ > diff --git a/kernel/Makefile b/kernel/Makefile > index 
d5c3006..b0afaa5 100644 > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ > hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ > notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ > async.o range.o > -obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o > +obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o fw_memmap.o > obj-y += groups.o > > ifdef CONFIG_FUNCTION_TRACER > diff --git a/kernel/fw_memmap.c b/kernel/fw_memmap.c > new file mode 100644 > index 0000000..11067f3 > --- /dev/null > +++ b/kernel/fw_memmap.c > @@ -0,0 +1,625 @@ > +/* > + * Handle the memory map. > + * The functions here do the job until bootmem takes over. > + * > + * Getting sanitize_e820_map() in sync with i386 version by applying change: > + * - Provisions for empty E820 memory regions (reported by certain BIOSes). > + * Alex Achenbach <xela(a)slit.de>, December 2002. > + * Venkatesh Pallipadi <venkatesh.pallipadi(a)intel.com> > + * > + */ > +#include <linux/kernel.h> > +#include <linux/types.h> > +#include <linux/init.h> > +#include <linux/bootmem.h> > +#include <linux/suspend.h> > +#include <linux/ioport.h> > + > +#include <linux/fw_memmap.h> > +#include "fw_memmap_internals.h" > + > +/* > + * The e820 map is the map that gets modified e.g. with command line parameters > + * and that is also registered with modifications in the kernel resource tree > + * with the iomem_resource as parent. > + */ > +struct e820map __initdata e820; > + > +/* > + * Add a memory region to the kernel e820 map. > + */ > +void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, > + int type) > +{ > + int x = e820x->nr_map; > + > + if (x >= ARRAY_SIZE(e820x->map)) { > + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); > + return; > + } > + > + e820x->map[x].addr = start; > + e820x->map[x].size = size; > + e820x->map[x].type = type; > + e820x->nr_map++; > +} > + > +void __init fw_memmap_add_region(u64 start, u64 size, int type) > +{ > + __e820_add_region(&e820, start, size, type); > +} > + > +/* make e820 not cover the range */ > +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, > + int checktype) > +{ > + int i; > + u64 end; > + u64 real_removed_size = 0; > + > + if (size > (ULLONG_MAX - start)) > + size = ULLONG_MAX - start; > + > + end = start + size; > + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", > + (unsigned long long) start, > + (unsigned long long) end); > + e820_print_type(old_type); > + printk(KERN_CONT "\n"); > + > + for (i = 0; i < e820.nr_map; i++) { > + struct e820entry *ei = &e820.map[i]; > + u64 final_start, final_end; > + > + if (checktype && ei->type != old_type) > + continue; > + /* totally covered? */ > + if (ei->addr >= start && > + (ei->addr + ei->size) <= (start + size)) { > + real_removed_size += ei->size; > + memset(ei, 0, sizeof(struct e820entry)); > + continue; > + } > + /* partially covered */ > + final_start = max(start, ei->addr); > + final_end = min(start + size, ei->addr + ei->size); > + if (final_start >= final_end) > + continue; > + real_removed_size += final_end - final_start; > + > + ei->size -= final_end - final_start; > + if (ei->addr < final_start) > + continue; > + ei->addr = final_end; > + } > + return real_removed_size; > +} > + > +void __init e820_print_type(u32 type) > +{ > + switch (type) { > + case E820_RAM: > + case E820_RESERVED_KERN: > + printk(KERN_CONT "(usable)"); > + break; > + case E820_RESERVED: > + printk(KERN_CONT "(reserved)"); > + break; > + case E820_ACPI: > + printk(KERN_CONT "(ACPI data)"); > + break; > + case E820_NVS: > + printk(KERN_CONT "(ACPI NVS)"); > + break; > + case E820_UNUSABLE: > + printk(KERN_CONT "(unusable)"); > + break; > + 
default: > + printk(KERN_CONT "type %u", type); > + break; > + } > +} > + > +void __init fw_memmap_print_map(char *who) > +{ > + int i; > + > + for (i = 0; i < e820.nr_map; i++) { > + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, > + (unsigned long long) e820.map[i].addr, > + (unsigned long long) > + (e820.map[i].addr + e820.map[i].size)); > + e820_print_type(e820.map[i].type); > + printk(KERN_CONT "\n"); > + } > +} > + > +/* > + * Sanitize the BIOS e820 map. > + * > + * Some e820 responses include overlapping entries. The following > + * replaces the original e820 map with a new one, removing overlaps, > + * and resolving conflicting memory types in favor of highest > + * numbered type. > + * > + * The input parameter biosmap points to an array of 'struct > + * e820entry' which on entry has elements in the range [0, *pnr_map) > + * valid, and which has space for up to max_nr_map entries. > + * On return, the resulting sanitized e820 map entries will be in > + * overwritten in the same location, starting at biosmap. > + * > + * The integer pointed to by pnr_map must be valid on entry (the > + * current number of valid entries located at biosmap) and will > + * be updated on return, with the new number of valid entries > + * (something no more than max_nr_map.) > + * > + * The return value from sanitize_e820_map() is zero if it > + * successfully 'sanitized' the map entries passed in, and is -1 > + * if it did nothing, which can happen if either of (1) it was > + * only passed one map entry, or (2) any of the input map entries > + * were invalid (start + size < start, meaning that the size was > + * so big the described memory range wrapped around through zero.) > + * > + * Visually we're performing the following > + * (1,2,3,4 = memory types)... 
> + * > + * Sample memory map (w/overlaps): > + * ____22__________________ > + * ______________________4_ > + * ____1111________________ > + * _44_____________________ > + * 11111111________________ > + * ____________________33__ > + * ___________44___________ > + * __________33333_________ > + * ______________22________ > + * ___________________2222_ > + * _________111111111______ > + * _____________________11_ > + * _________________4______ > + * > + * Sanitized equivalent (no overlap): > + * 1_______________________ > + * _44_____________________ > + * ___1____________________ > + * ____22__________________ > + * ______11________________ > + * _________1______________ > + * __________3_____________ > + * ___________44___________ > + * _____________33_________ > + * _______________2________ > + * ________________1_______ > + * _________________4______ > + * ___________________2____ > + * ____________________33__ > + * ______________________4_ > + */ > + > +int __init __sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, > + u32 *pnr_map) > +{ > + struct change_member { > + struct e820entry *pbios; /* pointer to original bios entry */ > + unsigned long long addr; /* address for this change point */ > + }; > + static struct change_member change_point_list[2*E820_X_MAX] __initdata; > + static struct change_member *change_point[2*E820_X_MAX] __initdata; > + static struct e820entry *overlap_list[E820_X_MAX] __initdata; > + static struct e820entry new_bios[E820_X_MAX] __initdata; > + struct change_member *change_tmp; > + unsigned long current_type, last_type; > + unsigned long long last_addr; > + int chgidx, still_changing; > + int overlap_entries; > + int new_bios_entry; > + int old_nr, new_nr, chg_nr; > + int i; > + > + /* if there's only one memory region, don't bother */ > + if (*pnr_map < 2) > + return -1; > + > + old_nr = *pnr_map; > + BUG_ON(old_nr > max_nr_map); > + > + /* bail out if we find any unreasonable addresses in bios map */ > + for (i = 0; i 
< old_nr; i++) > + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) > + return -1; > + > + /* create pointers for initial change-point information (for sorting) */ > + for (i = 0; i < 2 * old_nr; i++) > + change_point[i] = &change_point_list[i]; > + > + /* record all known change-points (starting and ending addresses), > + omitting those that are for empty memory regions */ > + chgidx = 0; > + for (i = 0; i < old_nr; i++) { > + if (biosmap[i].size != 0) { > + change_point[chgidx]->addr = biosmap[i].addr; > + change_point[chgidx++]->pbios = &biosmap[i]; > + change_point[chgidx]->addr = biosmap[i].addr + > + biosmap[i].size; > + change_point[chgidx++]->pbios = &biosmap[i]; > + } > + } > + chg_nr = chgidx; > + > + /* sort change-point list by memory addresses (low -> high) */ > + still_changing = 1; > + while (still_changing) { > + still_changing = 0; > + for (i = 1; i < chg_nr; i++) { > + unsigned long long curaddr, lastaddr; > + unsigned long long curpbaddr, lastpbaddr; > + > + curaddr = change_point[i]->addr; > + lastaddr = change_point[i - 1]->addr; > + curpbaddr = change_point[i]->pbios->addr; > + lastpbaddr = change_point[i - 1]->pbios->addr; > + > + /* > + * swap entries, when: > + * > + * curaddr > lastaddr or > + * curaddr == lastaddr and curaddr == curpbaddr and > + * lastaddr != lastpbaddr > + */ > + if (curaddr < lastaddr || > + (curaddr == lastaddr && curaddr == curpbaddr && > + lastaddr != lastpbaddr)) { > + change_tmp = change_point[i]; > + change_point[i] = change_point[i-1]; > + change_point[i-1] = change_tmp; > + still_changing = 1; > + } > + } > + } > + > + /* create a new bios memory map, removing overlaps */ > + overlap_entries = 0; /* number of entries in the overlap table */ > + new_bios_entry = 0; /* index for creating new bios map entries */ > + last_type = 0; /* start with undefined memory type */ > + last_addr = 0; /* start with 0 as last starting address */ > + > + /* loop through change-points, determining affect on the new bios 
map */ > + for (chgidx = 0; chgidx < chg_nr; chgidx++) { > + /* keep track of all overlapping bios entries */ > + if (change_point[chgidx]->addr == > + change_point[chgidx]->pbios->addr) { > + /* > + * add map entry to overlap list (> 1 entry > + * implies an overlap) > + */ > + overlap_list[overlap_entries++] = > + change_point[chgidx]->pbios; > + } else { > + /* > + * remove entry from list (order independent, > + * so swap with last) > + */ > + for (i = 0; i < overlap_entries; i++) { > + if (overlap_list[i] == > + change_point[chgidx]->pbios) > + overlap_list[i] = > + overlap_list[overlap_entries-1]; > + } > + overlap_entries--; > + } > + /* > + * if there are overlapping entries, decide which > + * "type" to use (larger value takes precedence -- > + * 1=usable, 2,3,4,4+=unusable) > + */ > + current_type = 0; > + for (i = 0; i < overlap_entries; i++) > + if (overlap_list[i]->type > current_type) > + current_type = overlap_list[i]->type; > + /* > + * continue building up new bios map based on this > + * information > + */ > + if (current_type != last_type) { > + if (last_type != 0) { > + new_bios[new_bios_entry].size = > + change_point[chgidx]->addr - last_addr; > + /* > + * move forward only if the new size > + * was non-zero > + */ > + if (new_bios[new_bios_entry].size != 0) > + /* > + * no more space left for new > + * bios entries ? 
> + */ > + if (++new_bios_entry >= max_nr_map) > + break; > + } > + if (current_type != 0) { > + new_bios[new_bios_entry].addr = > + change_point[chgidx]->addr; > + new_bios[new_bios_entry].type = current_type; > + last_addr = change_point[chgidx]->addr; > + } > + last_type = current_type; > + } > + } > + /* retain count for new bios entries */ > + new_nr = new_bios_entry; > + > + /* copy new bios mapping into original location */ > + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); > + *pnr_map = new_nr; > + > + return 0; > +} > + > +int __init sanitize_fw_memmap(void) > +{ > + int max_nr_map = ARRAY_SIZE(e820.map); > + > + return __sanitize_e820_map(e820.map, max_nr_map, &e820.nr_map); > +} > + > +void __init clear_e820_map(void) > +{ > + e820.nr_map = 0; > +} > + > +static int userdef __initdata; > + > +/* "mem=nopentium" disables the 4MB page tables. */ > +static int __init parse_memopt(char *p) > +{ > + u64 mem_size; > + > + if (!p) > + return -EINVAL; > + > +#ifdef CONFIG_X86_32 > + if (!strcmp(p, "nopentium")) { > + setup_clear_cpu_cap(X86_FEATURE_PSE); > + return 0; > + } > +#endif > + > + userdef = 1; > + mem_size = memparse(p, &p); > + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); > + > + return 0; > +} > +early_param("mem", parse_memopt); > + > +static int __init parse_memmap_opt(char *p) > +{ > + char *oldp; > + u64 start_at, mem_size; > + > + if (!p) > + return -EINVAL; > + > + if (!strncmp(p, "exactmap", 8)) { > +#ifdef CONFIG_CRASH_DUMP > + /* > + * If we are doing a crash dump, we still need to know > + * the real mem size before original memory map is > + * reset. 
> + */ > + saved_max_pfn = fw_memmap_end_of_ram_pfn(); > +#endif > + e820.nr_map = 0; > + userdef = 1; > + return 0; > + } > + > + oldp = p; > + mem_size = memparse(p, &p); > + if (p == oldp) > + return -EINVAL; > + > + userdef = 1; > + if (*p == '@') { > + start_at = memparse(p+1, &p); > + e820_add_region(start_at, mem_size, E820_RAM); > + } else if (*p == '#') { > + start_at = memparse(p+1, &p); > + e820_add_region(start_at, mem_size, E820_ACPI); > + } else if (*p == '$') { > + start_at = memparse(p+1, &p); > + e820_add_region(start_at, mem_size, E820_RESERVED); > + } else > + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); > + > + return *p == '\0' ? 0 : -EINVAL; > +} > +early_param("memmap", parse_memmap_opt); > + > +static void early_panic(char *msg) > +{ > + early_printk(msg); > + panic(msg); > +} > + > +void __init finish_fw_memmap_parsing(void) > +{ > + if (userdef) { > + u32 nr = e820.nr_map; > + int max_nr_map = ARRAY_SIZE(e820.map); > + > + if (__sanitize_e820_map(e820.map, max_nr_map, &nr) < 0) > + early_panic("Invalid user supplied memory map"); > + e820.nr_map = nr; > + > + printk(KERN_INFO "user-defined physical RAM map:\n"); > + e820_print_map("user"); > + } > +} > + > +/* > + * Find a free area with specified alignment in a specific range. 
> + */ > +u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) > +{ > + int i; > + > + for (i = 0; i < e820.nr_map; i++) { > + struct e820entry *ei = &e820.map[i]; > + u64 addr; > + u64 ei_start, ei_last; > + > + if (ei->type != E820_RAM) > + continue; > + > + ei_last = ei->addr + ei->size; > + ei_start = ei->addr; > + addr = find_early_area(ei_start, ei_last, start, end, > + size, align); > + > + if (addr != -1ULL) > + return addr; > + } > + return -1ULL; > +} > + > +u64 __init > +find_fw_memmap_area_node(int nid, u64 start, u64 end, u64 size, u64 align) > +{ > + u64 addr; > + /* > + * need to call this function after e820_register_active_regions > + * so early_node_map[] is set > + */ > + addr = find_memory_core_early(nid, size, align, start, end); > + if (addr != -1ULL) > + return addr; > + > + /* fallback, should already have start end in the node range */ > + return find_fw_memmap_area(start, end, size, align); > +} > + > +#ifdef CONFIG_X86_32 > +# ifdef CONFIG_X86_PAE > +# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) > +# else > +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) > +# endif > +#else /* CONFIG_X86_32 */ > +# define MAX_ARCH_PFN (MAXMEM>>PAGE_SHIFT) > +#endif > + > +/* > + * Find the highest page frame number we have available > + */ > +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) > +{ > + int i; > + unsigned long last_pfn = 0; > + unsigned long max_arch_pfn = MAX_ARCH_PFN; > + > + for (i = 0; i < e820.nr_map; i++) { > + struct e820entry *ei = &e820.map[i]; > + unsigned long start_pfn; > + unsigned long end_pfn; > + > + if (ei->type != type) > + continue; > + > + start_pfn = ei->addr >> PAGE_SHIFT; > + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; > + > + if (start_pfn >= limit_pfn) > + continue; > + if (end_pfn > limit_pfn) { > + last_pfn = limit_pfn; > + break; > + } > + if (end_pfn > last_pfn) > + last_pfn = end_pfn; > + } > + > + if (last_pfn > max_arch_pfn) > + last_pfn = max_arch_pfn; > + 
> + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", > + last_pfn, max_arch_pfn); > + return last_pfn; > +} > +unsigned long __init fw_memmap_end_of_ram_pfn(void) > +{ > + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); > +} > + > +unsigned long __init e820_end_of_low_ram_pfn(void) > +{ > + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); > +} > +/* > + * Finds an active region in the address range from start_pfn to last_pfn and > + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. > + */ > +static int __init e820_find_active_region(const struct e820entry *ei, > + unsigned long start_pfn, > + unsigned long last_pfn, > + unsigned long *ei_startpfn, > + unsigned long *ei_endpfn) > +{ > + u64 align = PAGE_SIZE; > + > + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; > + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; > + > + /* Skip map entries smaller than a page */ > + if (*ei_startpfn >= *ei_endpfn) > + return 0; > + > + /* Skip if map is outside the node */ > + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || > + *ei_startpfn >= last_pfn) > + return 0; > + > + /* Check for overlaps */ > + if (*ei_startpfn < start_pfn) > + *ei_startpfn = start_pfn; > + if (*ei_endpfn > last_pfn) > + *ei_endpfn = last_pfn; > + > + return 1; > +} > + > +/* Walk the e820 map and register active regions within a node */ > +void __init fw_memmap_register_active_regions(int nid, unsigned long start_pfn, > + unsigned long last_pfn) > +{ > + unsigned long ei_startpfn; > + unsigned long ei_endpfn; > + int i; > + > + for (i = 0; i < e820.nr_map; i++) > + if (e820_find_active_region(&e820.map[i], > + start_pfn, last_pfn, > + &ei_startpfn, &ei_endpfn)) > + add_active_range(nid, ei_startpfn, ei_endpfn); > +} > + > +/* > + * Find the hole size (in bytes) in the memory range. 
> + * @start: starting address of the memory range to scan > + * @end: ending address of the memory range to scan > + */ > +u64 __init fw_memmap_hole_size(u64 start, u64 end) > +{ > + unsigned long start_pfn = start >> PAGE_SHIFT; > + unsigned long last_pfn = end >> PAGE_SHIFT; > + unsigned long ei_startpfn, ei_endpfn, ram = 0; > + int i; > + > + for (i = 0; i < e820.nr_map; i++) { > + if (e820_find_active_region(&e820.map[i], > + start_pfn, last_pfn, > + &ei_startpfn, &ei_endpfn)) > + ram += ei_endpfn - ei_startpfn; > + } > + return end - start - ((u64)ram << PAGE_SHIFT); > +} > diff --git a/kernel/fw_memmap_internals.h b/kernel/fw_memmap_internals.h > new file mode 100644 > index 0000000..f217602 > --- /dev/null > +++ b/kernel/fw_memmap_internals.h > @@ -0,0 +1,49 @@ > +#ifndef __KERNEL_FW_MEMMAP_INTERNALS_H > +#define __KERNEL_FW_MEMMAP_INTERNALS_H > + > +/* > + * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the > + * constrained space in the zeropage. If we have more nodes than > + * that, and if we've booted off EFI firmware, then the EFI tables > + * passed us from the EFI firmware can list more nodes. Size our > + * internal memory map tables to have room for these additional > + * nodes, based on up to three entries per node for which the > + * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), > + * plus E820MAX, allowing space for the possible duplicate E820 > + * entries that might need room in the same arrays, prior to the > + * call to sanitize_e820_map() to remove duplicates. The allowance > + * of three memory map entries per node is "enough" entries for > + * the initial hardware platform motivating this mechanism to make > + * use of additional EFI map entries. Future platforms may want > + * to allow more than three entries per node or otherwise refine > + * this size. 
> + */ > + > +/* > + * Odd: 'make headers_check' complains about numa.h if I try > + * to collapse the next two #ifdef lines to a single line: > + * #if defined(__KERNEL__) && defined(CONFIG_EFI) > + */ > +#ifdef __KERNEL__ > +#ifdef CONFIG_EFI > +#include <linux/numa.h> > +#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) > +#else /* ! CONFIG_EFI */ > +#define E820_X_MAX E820MAX > +#endif > +#else /* ! __KERNEL__ */ > +#define E820_X_MAX E820MAX > +#endif > + > +#ifndef __ASSEMBLY__ > +struct e820map { > + __u32 nr_map; > + struct e820entry map[E820_X_MAX]; > +}; > +#endif > + > +extern struct e820map __initdata e820; > +void e820_print_type(u32 type); > +void __e820_add_region(struct e820map *e820x, u64 start, u64 size, int type); > + > +#endif -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: David Miller on 22 Mar 2010 00:10 You really aren't listening to us at all. We told you a thousand times to investigate using LMB for all of this. Instead you are posting the sparc64 conversion to the e820 stuff again. That action means you absolutely don't value our feedback at all. You seem to really not care what a mess you are (unnecessarily) making. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Benjamin Herrenschmidt on 22 Mar 2010 01:20 On Sun, 2010-03-21 at 20:56 -0700, Yinghai Lu wrote: > On 03/21/2010 07:37 PM, Benjamin Herrenschmidt wrote: > > On Sun, 2010-03-21 at 00:13 -0700, Yinghai Lu wrote: > >> move it to kernel/fw_memmap.c from arch/x86/kernel/e820.c > >> > >> -v2: add fw_memmap wrapper to some func... > >> move some functions back to e820.c > > > > NAK > at this point, kernel/early_res.c and kernel/fw_memmap.c is protected with HAVE_EARLY_RES > > obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o fw_memmap.o > > so it will not increase size that doesn't support early_res/bootmem yet. I'm still not at all happy with it. It's not only about increasing the size of the kernel. It's about moving some x86 specific stuff and more or less arbitrarily deciding that everybody has to convert to that model now, despite the fact that more suited alternatives have existed for years, rather than thinking about doing the logical thing, which is to convert x86 over to lmb, eventually adding the missing functionalities in lmb if need be. Also, there's something just plain gross about the choice of names. fw_memmap is something I wouldn't wish my enemies to have to type on a keyboard, it looks ugly, and it leads to way too long function names. In addition to the fact that your "generic" facility is still all cluttered with the e820 names and other very x86 centric definitions. It -may- well be that adapting x86 to lmb isn't a practical approach, but if that was the case, then please justify why with precise technical reasons, which we can discuss then in detail and make a decision based on that. Cheers, Ben. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Paul Mackerras on 22 Mar 2010 07:40 On Mon, Mar 22, 2010 at 10:28:09AM +0100, Ingo Molnar wrote: > ( Cc:-ed Andrew and Linus as this is a general design/policy matter wrt. > memory management. ) [snip] > Please also realize the difficulties Yinghai has gone through already: > converting and generalizing _all_ of the x86 early allocation code to a more > generic core kernel approach, with essentially zero interest from _any_ > non-x86 person ... It still seemed to have a lot that was x86-specific - in particular it seemed to have a lot of code to cope with various mistakes that firmware might have made in the memory map. That adds code which is basically just bloat on architectures where those problems don't arise. The fw_memmap.c code also still seemed to be tied to the x86 e820 data structures and layouts. > Those early_res patches were posted all over on lkml, it was literally > hundreds of difficult patches, and now, months down the line, after we've > tested and upstreamed it (with many nasty regressions fixed on x86 during the > development of it) you come with a rigid "do it some other way, convert all of > x86 over again or else" position. Well I personally don't mind if x86 uses early_res or whatever other code in arch/x86 to handle the problems that arise from deficient firmware. I just don't see any value in converting powerpc or sparc64 over to using ~2000 lines of early_res/fw_memmap code where the existing ~500 lines of lmb code is working just fine. And I don't see the point of moving the x86 e820 stuff into the kernel directory. Does any other platform use e820 tables? > I really wish non-x86 architectures appreciated (and helped) the core kernel > work x86 is doing, because it is subsidizing non-x86 architectures all the > time. We do help with core kernel work. Coping with deficient x86 firmware doesn't really feel like core kernel work to me though. Paul. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Benjamin Herrenschmidt on 22 Mar 2010 17:10
On Mon, 2010-03-22 at 10:28 +0100, Ingo Molnar wrote: > Those early_res patches were posted all over on lkml, it was literally > hundreds of difficult patches, and now, months down the line, after we've > tested and upstreamed it (with many nasty regressions fixed on x86 during the > development of it) you come with a rigid "do it some other way, convert all of > x86 over again or else" position. There's an easy solution here. Leave that gunk in arch/x86 where it belongs and if you want to unify things a bit, then do it at the -API- level only, and leave the implementation where it is. > I really wish non-x86 architectures appreciated (and helped) the core kernel > work x86 is doing, because it is subsidizing non-x86 architectures all the > time. I'm not even going to bother replying to that one. Ben. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |