memcg: dirty pages instrumentation [Kernel]

Prev: [announce] gujin GPL bootloader version 2.8
Next: [Bug #14792] Misdetection of the TV output

From: Peter Zijlstra on 22 Feb 2010 13:30

On Sun, 2010-02-21 at 16:18 +0100, Andrea Righi wrote:
> @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
> */
> static int calc_period_shift(void)
> {
> - unsigned long dirty_total;
> + unsigned long dirty_total, dirty_bytes;
>
> - if (vm_dirty_bytes)
> - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> + dirty_bytes = mem_cgroup_dirty_bytes();
> + if (dirty_bytes)
> + dirty_total = dirty_bytes / PAGE_SIZE;
> else
> dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> 100;
> @@ -406,14 +407,20 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
> */
> unsigned long determine_dirtyable_memory(void)
> {
> - unsigned long x;
> -
> - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> -
> + unsigned long memcg_memory, memory;
> +
> + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> + if (memcg_memory > 0) {
> + memcg_memory +=
> + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> + if (memcg_memory < memory)
> + return memcg_memory;
> + }
> if (!vm_highmem_is_dirtyable)
> - x -= highmem_dirtyable_memory(x);
> + memory -= highmem_dirtyable_memory(memory);
>
> - return x + 1; /* Ensure that we never return 0 */
> + return memory + 1; /* Ensure that we never return 0 */
> }
>
> void
> @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
> unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
> {
> unsigned long background;
> - unsigned long dirty;
> + unsigned long dirty, dirty_bytes;
> unsigned long available_memory = determine_dirtyable_memory();
> struct task_struct *tsk;
>
> - if (vm_dirty_bytes)
> - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> + dirty_bytes = mem_cgroup_dirty_bytes();
> + if (dirty_bytes)
> + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
> else {
> int dirty_ratio;
>
> @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space *mapping,
> get_dirty_limits(&background_thresh, &dirty_thresh,
> &bdi_thresh, bdi);
>
> - nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> + nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
> + if (nr_reclaimable == 0) {
> + nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> global_page_state(NR_UNSTABLE_NFS);
> - nr_writeback = global_page_state(NR_WRITEBACK);
> + nr_writeback = global_page_state(NR_WRITEBACK);
> + } else {
> + nr_reclaimable +=
> + mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
> + nr_writeback =
> + mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> + }
>
> bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> @@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> unsigned long dirty_thresh;
>
> for ( ; ; ) {
> + unsigned long dirty;
> +
> get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>
> /*
> @@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> */
> dirty_thresh += dirty_thresh / 10; /* wheeee... */
>
> - if (global_page_state(NR_UNSTABLE_NFS) +
> - global_page_state(NR_WRITEBACK) <= dirty_thresh)
> - break;
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> + dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> + if (dirty < 0)
> + dirty = global_page_state(NR_UNSTABLE_NFS) +
> + global_page_state(NR_WRITEBACK);
> + else
> + dirty += mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
> + if (dirty <= dirty_thresh)
> + break;
> + congestion_wait(BLK_RW_ASYNC, HZ/10);
>
> /*
> * The caller might hold locks which can prevent IO completion

This stuff looks really rather horrible.

Relying on these cgroup functions returning 0 seems fragile; some of
them can legitimately be 0. Also, sprinkling all those "if cgroup foo"
checks all over the place leads to the ugly indentation problems you have.

How about pulling all these things into separate functions, and using a
proper mem_cgroup_has_dirty() function to select on?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Andrea Righi on 23 Feb 2010 04:50

On Mon, Feb 22, 2010 at 07:20:09PM +0100, Peter Zijlstra wrote:
> On Sun, 2010-02-21 at 16:18 +0100, Andrea Righi wrote:
> > @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
> > */
> > static int calc_period_shift(void)
> > {
> > - unsigned long dirty_total;
> > + unsigned long dirty_total, dirty_bytes;
> >
> > - if (vm_dirty_bytes)
> > - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> > + dirty_bytes = mem_cgroup_dirty_bytes();
> > + if (dirty_bytes)
> > + dirty_total = dirty_bytes / PAGE_SIZE;
> > else
> > dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> > 100;
> > @@ -406,14 +407,20 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
> > */
> > unsigned long determine_dirtyable_memory(void)
> > {
> > - unsigned long x;
> > -
> > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > -
> > + unsigned long memcg_memory, memory;
> > +
> > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> > + if (memcg_memory > 0) {
> > + memcg_memory +=
> > + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> > + if (memcg_memory < memory)
> > + return memcg_memory;
> > + }
> > if (!vm_highmem_is_dirtyable)
> > - x -= highmem_dirtyable_memory(x);
> > + memory -= highmem_dirtyable_memory(memory);
> >
> > - return x + 1; /* Ensure that we never return 0 */
> > + return memory + 1; /* Ensure that we never return 0 */
> > }
> >
> > void
> > @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
> > unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
> > {
> > unsigned long background;
> > - unsigned long dirty;
> > + unsigned long dirty, dirty_bytes;
> > unsigned long available_memory = determine_dirtyable_memory();
> > struct task_struct *tsk;
> >
> > - if (vm_dirty_bytes)
> > - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > + dirty_bytes = mem_cgroup_dirty_bytes();
> > + if (dirty_bytes)
> > + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
> > else {
> > int dirty_ratio;
> >
> > @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space *mapping,
> > get_dirty_limits(&background_thresh, &dirty_thresh,
> > &bdi_thresh, bdi);
> >
> > - nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > + nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
> > + if (nr_reclaimable == 0) {
> > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > global_page_state(NR_UNSTABLE_NFS);
> > - nr_writeback = global_page_state(NR_WRITEBACK);
> > + nr_writeback = global_page_state(NR_WRITEBACK);
> > + } else {
> > + nr_reclaimable +=
> > + mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
> > + nr_writeback =
> > + mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> > + }
> >
> > bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > @@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > unsigned long dirty_thresh;
> >
> > for ( ; ; ) {
> > + unsigned long dirty;
> > +
> > get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> >
> > /*
> > @@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > */
> > dirty_thresh += dirty_thresh / 10; /* wheeee... */
> >
> > - if (global_page_state(NR_UNSTABLE_NFS) +
> > - global_page_state(NR_WRITEBACK) <= dirty_thresh)
> > - break;
> > - congestion_wait(BLK_RW_ASYNC, HZ/10);
> > + dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> > + if (dirty < 0)
> > + dirty = global_page_state(NR_UNSTABLE_NFS) +
> > + global_page_state(NR_WRITEBACK);
> > + else
> > + dirty += mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
> > + if (dirty <= dirty_thresh)
> > + break;
> > + congestion_wait(BLK_RW_ASYNC, HZ/10);
> >
> > /*
> > * The caller might hold locks which can prevent IO completion
>
>
> This stuff looks really rather horrible,
>
> Relying on these cgroup functions returning 0 seems fragile, some of
> them can really be 0. Also sprinkling all that if cgroup foo all over
> the place leads to these ugly indentation problems you have.
>
> How about pulling all these things into separate functions, and using a
> proper mem_cgroup_has_dirty() function to select on?

Agreed. Will do in the next version of the patch.

Thanks,
-Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Andrea Righi on 23 Feb 2010 04:50

On Tue, Feb 23, 2010 at 10:40:40AM +0100, Andrea Righi wrote:
> > If vm_highmem_is_dirtyable=0, In that case, we can still return with
> > "memcg_memory" which can be more than "memory". IOW, highmem is not
> > dirtyable system wide but still we can potetially return back saying
> > for this cgroup we can dirty more pages which can potenailly be acutally
> > be more that system wide allowed?
> >
> > Because you have modified dirtyable_memory() and made it per cgroup, I
> > think it automatically takes care of the cases of per cgroup dirty ratio,
> > I mentioned in my previous mail. So we will use system wide dirty ratio
> > to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> > available_memory()) and if this cgroup wrote too many pages start
> > writeout?
>
> OK, if I've understood well, you're proposing to use per-cgroup
> dirty_ratio interface and do something like:
>
> unsigned long determine_dirtyable_memory(void)
> {
> unsigned long memcg_memory, memory;
>
> memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> if (!vm_highmem_is_dirtyable)
> memory -= highmem_dirtyable_memory(memory);
>
> memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> if (!memcg_memory)
> return memory + 1; /* Ensure that we never return 0 */
> memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> if (!vm_highmem_is_dirtyable)
> memcg_memory -= highmem_dirtyable_memory(memory) *
> mem_cgroup_dirty_ratio() / 100;

ok, this is wrong:

> if (memcg_memory < memory)
> return memcg_memory;
> }

return min(memcg_memory, memory);

-Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Vivek Goyal on 23 Feb 2010 15:00

On Tue, Feb 23, 2010 at 10:40:40AM +0100, Andrea Righi wrote:
> On Mon, Feb 22, 2010 at 11:52:15AM -0500, Vivek Goyal wrote:
> > > unsigned long determine_dirtyable_memory(void)
> > > {
> > > - unsigned long x;
> > > -
> > > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > > -
> > > + unsigned long memcg_memory, memory;
> > > +
> > > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > > + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> > > + if (memcg_memory > 0) {
> >
> > it could be just
> >
> > if (memcg_memory) {
>
> Agreed.
>
> > }
> >
> > > + memcg_memory +=
> > > + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> > > + if (memcg_memory < memory)
> > > + return memcg_memory;
> > > + }
> > > if (!vm_highmem_is_dirtyable)
> > > - x -= highmem_dirtyable_memory(x);
> > > + memory -= highmem_dirtyable_memory(memory);
> > >
> >
> > If vm_highmem_is_dirtyable=0, In that case, we can still return with
> > "memcg_memory" which can be more than "memory". IOW, highmem is not
> > dirtyable system wide but still we can potetially return back saying
> > for this cgroup we can dirty more pages which can potenailly be acutally
> > be more that system wide allowed?
> >
> > Because you have modified dirtyable_memory() and made it per cgroup, I
> > think it automatically takes care of the cases of per cgroup dirty ratio,
> > I mentioned in my previous mail. So we will use system wide dirty ratio
> > to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> > available_memory()) and if this cgroup wrote too many pages start
> > writeout?
>
> OK, if I've understood well, you're proposing to use per-cgroup
> dirty_ratio interface and do something like:

I think we can use the system-wide dirty_ratio for each cgroup, instead of
providing a configurable dirty_ratio per cgroup where each memory
cgroup can have a different dirty ratio. (I can't think of a use case for
that immediately.)
>
> unsigned long determine_dirtyable_memory(void)
> {
> unsigned long memcg_memory, memory;
>
> memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> if (!vm_highmem_is_dirtyable)
> memory -= highmem_dirtyable_memory(memory);
>
> memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> if (!memcg_memory)
> return memory + 1; /* Ensure that we never return 0 */
> memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> if (!vm_highmem_is_dirtyable)
> memcg_memory -= highmem_dirtyable_memory(memory) *
> mem_cgroup_dirty_ratio() / 100;
> if (memcg_memory < memory)
> return memcg_memory;
> }
>

This one is tricky and I don't have good answers. I have concerns though.

- While calculating system-wide dirtyable memory, we rely on the memory
actually available (NR_FREE_PAGES + reclaimable_pages). For the free pages
available per memory cgroup, we are not necessarily relying on the
dirtyable memory actually available, but on a user-configurable
limit (LIMIT - USAGE = cgroup_dirtyable_memory).

This is fine as long as the sum of the limits of all cgroups is not more
than the available memory. But if somebody sets the "limit" to a high value,
we will allow lots of writes from that cgroup without it being throttled.

So if the memory cgroups are not configured such that the total of their
limits represents the actual memory in the system, we might end up with a
lot more dirty pages in the system.

- Subtracting high memory pages from dirtyable memory is tricky, because
it is unclear how to account for them in the per-cgroup calculation. Maybe
we can just do the following.

calculate_memcg_memory;
memory = memory - highmem_dirtyable_memory();
if (memcg_memory < memory)
return memcg_memory;

Not sure. This is very crude and leaves room for more pages being
dirty than there otherwise would have been. Ideas?

Vivek

>
> >
> > > - return x + 1; /* Ensure that we never return 0 */
> > > + return memory + 1; /* Ensure that we never return 0 */
> > > }
> > >
> > > void
> > > @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
> > > unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
> > > {
> > > unsigned long background;
> > > - unsigned long dirty;
> > > + unsigned long dirty, dirty_bytes;
> > > unsigned long available_memory = determine_dirtyable_memory();
> > > struct task_struct *tsk;
> > >
> > > - if (vm_dirty_bytes)
> > > - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > > + dirty_bytes = mem_cgroup_dirty_bytes();
> > > + if (dirty_bytes)
> > > + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
> > > else {
> > > int dirty_ratio;
> > >
> > > @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space *mapping,
> > > get_dirty_limits(&background_thresh, &dirty_thresh,
> > > &bdi_thresh, bdi);
> > >
> > > - nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > > + nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
> > > + if (nr_reclaimable == 0) {
> > > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > > global_page_state(NR_UNSTABLE_NFS);
> > > - nr_writeback = global_page_state(NR_WRITEBACK);
> > > + nr_writeback = global_page_state(NR_WRITEBACK);
> > > + } else {
> > > + nr_reclaimable +=
> > > + mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
> > > + nr_writeback =
> > > + mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> > > + }
> > >
> > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > > @@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > > unsigned long dirty_thresh;
> > >
> > > for ( ; ; ) {
> > > + unsigned long dirty;
> > > +
> > > get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> > >
> > > /*
> > > @@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > > */
> > > dirty_thresh += dirty_thresh / 10; /* wheeee... */
> > >
> > > - if (global_page_state(NR_UNSTABLE_NFS) +
> > > - global_page_state(NR_WRITEBACK) <= dirty_thresh)
> > > - break;
> > > - congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > + dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
> > > + if (dirty < 0)
> >
> > dirty is unsigned long. Will above condition be ever true?
> >
> > Are you expecting that NR_WRITEBACK can go negative?
>
> No, this is a bug, indeed. The right check is just "if (dirty)".
>
> Thanks!
> -Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Vivek Goyal on 23 Feb 2010 16:40

On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:

[..]
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0b19943..c9ff1cd 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
> */
> static int calc_period_shift(void)
> {
> - unsigned long dirty_total;
> + unsigned long dirty_total, dirty_bytes;
>
> - if (vm_dirty_bytes)
> - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> + dirty_bytes = mem_cgroup_dirty_bytes();
> + if (dirty_bytes)
> + dirty_total = dirty_bytes / PAGE_SIZE;
> else
> dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> 100;

OK, I don't understand this, so I'd better ask. Can you explain a bit how the
memory cgroup dirty ratio is going to interact with the per-BDI dirty
proportion thing?

Currently we seem to calculate a per-BDI proportion (based on recently
completed events) of the system-wide dirty ratio, and use it to decide
whether a process should be throttled or not.

Because the throttling decision is also based on the BDI and its proportion,
how are we going to fit it with mem cgroup? Is it going to be the BDI
proportion of dirty memory within the memory cgroup (and not system wide)?

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

First | Prev | Next | Last
Pages: 1 2 3 4 5 6 7 8 9 10 11 12
Prev: [announce] gujin GPL bootloader version 2.8
Next: [Bug #14792] Misdetection of the TV output