From: Shaohua Li on
On Tue, Apr 06, 2010 at 01:03:25PM +0800, Wu, Fengguang wrote:
> Shaohua,
>
> > + scan = zone_nr_lru_pages(zone, sc, l);
> > + if (priority) {
> > + scan >>= priority;
> > + scan = (scan * fraction[file] / denominator[file]);
>
> Ah, the (scan * fraction[file]) may overflow in 32bit kernel!
good catch. will change it to u64.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Shaohua Li on
On Tue, Apr 06, 2010 at 01:03:25PM +0800, Wu, Fengguang wrote:
> Shaohua,
>
> > + scan = zone_nr_lru_pages(zone, sc, l);
> > + if (priority) {
> > + scan >>= priority;
> > + scan = (scan * fraction[file] / denominator[file]);
>
> Ah, the (scan * fraction[file]) may overflow in 32bit kernel!
I updated the patch to address previous issues.
Is it possible to put this into the -mm tree to see if anything weird happens?



get_scan_ratio() calculates a percentage, and if that percentage is < 1% it is
rounded down to 0%. As a result we completely skip scanning anon/file pages
when reclaiming memory, even if the total number of anon/file pages is very large.

To avoid this underflow, we don't use a percentage; instead we directly calculate
how many pages should be scanned. This way we still get several pages to scan
when the share is < 1%.

This has some benefits:
1. It increases the precision of our calculation.
2. It makes our scanning smoother. Without this, if percent[x] underflows,
shrink_zone() doesn't scan any pages, and then suddenly scans all pages when
priority reaches zero. With this, shrink_zone() gets a chance to scan some
pages even when priority isn't zero.

Note, this patch doesn't really change the logic; it just increases precision. On
systems with a lot of memory, this might slightly change behavior. For example,
in a sequential file read workload, without the patch we don't swap any anon
pages. With it, if the anon memory size is bigger than 16G, we will see one anon page
swapped. The 16G is calculated as PAGE_SIZE * priority(4096) * (fp/ap), where fp/ap
is assumed to be 1024, which is common in this workload. So the impact does not
seem to be a big deal.

Signed-off-by: Shaohua Li <shaohua.li(a)intel.com>
Cc: KOSAKI Motohiro <kosaki.motohiro(a)jp.fujitsu.com>
Cc: Rik van Riel <riel(a)redhat.com>
Cc: Wu Fengguang <fengguang.wu(a)intel.com>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311..1070f83 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1519,21 +1519,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
}

/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+ unsigned long *nr_saved_scan)
+{
+ unsigned long nr;
+
+ *nr_saved_scan += nr_to_scan;
+ nr = *nr_saved_scan;
+
+ if (nr >= SWAP_CLUSTER_MAX)
+ *nr_saved_scan = 0;
+ else
+ nr = 0;
+
+ return nr;
+}
+
+/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
- * percent[0] specifies how much pressure to put on ram/swap backed
- * memory, while percent[1] determines pressure on the file LRUs.
+ * nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
-static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
- unsigned long *percent)
+static void get_scan_count(struct zone *zone, struct scan_control *sc,
+ unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ u64 fraction[2], denominator;
+ enum lru_list l;
+ int noswap = 0;
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
+ noswap = 1;
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
+ goto out;
+ }

anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,9 +1576,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
- percent[0] = 100;
- percent[1] = 0;
- return;
+ fraction[0] = 1;
+ fraction[1] = 0;
+ denominator = 1;
+ goto out;
}
}

@@ -1594,29 +1626,22 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;

- /* Normalize to percentages */
- percent[0] = 100 * ap / (ap + fp + 1);
- percent[1] = 100 - percent[0];
-}
-
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
-{
- unsigned long nr;
-
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
+ fraction[0] = ap;
+ fraction[1] = fp;
+ denominator = ap + fp + 1;
+out:
+ for_each_evictable_lru(l) {
+ int file = is_file_lru(l);
+ unsigned long scan;

- return nr;
+ scan = zone_nr_lru_pages(zone, sc, l);
+ if (priority || noswap) {
+ scan >>= priority;
+ scan = div64_u64(scan * fraction[file], denominator);
+ }
+ nr[l] = nr_scan_try_batch(scan,
+ &reclaim_stat->nr_saved_scan[l]);
+ }
}

/*
@@ -1627,33 +1652,11 @@ static void shrink_zone(int priority, struct zone *zone,
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int noswap = 0;
-
- /* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (nr_swap_pages <= 0)) {
- noswap = 1;
- percent[0] = 0;
- percent[1] = 100;
- } else
- get_scan_ratio(zone, sc, percent);

- for_each_evictable_lru(l) {
- int file = is_file_lru(l);
- unsigned long scan;
-
- scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap) {
- scan >>= priority;
- scan = (scan * percent[file]) / 100;
- }
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
- }
+ get_scan_count(zone, sc, nr, priority);

while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Rik van Riel on
On 04/09/2010 05:20 PM, Andrew Morton wrote:

> Come to that, it's not obvious that we need this in 2.6.34 either. What
> is the user-visible impact here?

I suspect very little impact, especially during workloads
where we can just reclaim clean page cache at DEF_PRIORITY.
FWIW, the patch looks good to me, so:

Acked-by: Rik van Riel <riel(a)redhat.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Andrew Morton on
On Fri, 9 Apr 2010 14:51:04 +0800
Shaohua Li <shaohua.li(a)intel.com> wrote:

> get_scan_ratio() calculates percentage and if the percentage is < 1%, it will
> round percentage down to 0% and cause we completely ignore scanning anon/file
> pages to reclaim memory even the total anon/file pages are very big.
>
> To avoid underflow, we don't use percentage, instead we directly calculate
> how many pages should be scaned. In this way, we should get several scanned pages
> for < 1% percent.
>
> This has some benefits:
> 1. increase our calculation precision
> 2. making our scan more smoothly. Without this, if percent[x] is underflow,
> shrink_zone() doesn't scan any pages and suddenly it scans all pages when priority
> is zero. With this, even priority isn't zero, shrink_zone() gets chance to scan
> some pages.
>
> Note, this patch doesn't really change logics, but just increase precision. For
> system with a lot of memory, this might slightly changes behavior. For example,
> in a sequential file read workload, without the patch, we don't swap any anon
> pages. With it, if anon memory size is bigger than 16G, we will see one anon page
> swapped. The 16G is calculated as PAGE_SIZE * priority(4096) * (fp/ap). fp/ap
> is assumed to be 1024 which is common in this workload. So the impact sounds not
> a big deal.

I grabbed this.

Did we decide that this needed to be backported into 2.6.33.x? If so,
some words explaining the reasoning would be needed.

Come to that, it's not obvious that we need this in 2.6.34 either. What
is the user-visible impact here?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Shaohua Li on
On Sat, Apr 10, 2010 at 05:20:57AM +0800, Andrew Morton wrote:
> On Fri, 9 Apr 2010 14:51:04 +0800
> Shaohua Li <shaohua.li(a)intel.com> wrote:
>
> > get_scan_ratio() calculates percentage and if the percentage is < 1%, it will
> > round percentage down to 0% and cause we completely ignore scanning anon/file
> > pages to reclaim memory even the total anon/file pages are very big.
> >
> > To avoid underflow, we don't use percentage, instead we directly calculate
> > how many pages should be scaned. In this way, we should get several scanned pages
> > for < 1% percent.
> >
> > This has some benefits:
> > 1. increase our calculation precision
> > 2. making our scan more smoothly. Without this, if percent[x] is underflow,
> > shrink_zone() doesn't scan any pages and suddenly it scans all pages when priority
> > is zero. With this, even priority isn't zero, shrink_zone() gets chance to scan
> > some pages.
> >
> > Note, this patch doesn't really change logics, but just increase precision. For
> > system with a lot of memory, this might slightly changes behavior. For example,
> > in a sequential file read workload, without the patch, we don't swap any anon
> > pages. With it, if anon memory size is bigger than 16G, we will see one anon page
> > swapped. The 16G is calculated as PAGE_SIZE * priority(4096) * (fp/ap). fp/ap
> > is assumed to be 1024 which is common in this workload. So the impact sounds not
> > a big deal.
>
> I grabbed this.
>
> Did we decide that this needed to be backported into 2.6.33.x? If so,
> some words explaining the reasoning would be needed.
>
> Come to that, it's not obvious that we need this in 2.6.34 either.
Not needed.

> What is the user-visible impact here?
Should be very small I think.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/