From: Andrea Righi on 2 Mar 2010 17:20 On Tue, Mar 02, 2010 at 02:48:56PM +0100, Peter Zijlstra wrote: > On Mon, 2010-03-01 at 22:23 +0100, Andrea Righi wrote: > > Apply the cgroup dirty pages accounting and limiting infrastructure to > > the opportune kernel functions. > > > > Signed off-by: Andrea Righi <arighi(a)develer.com> > > --- > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > > index 5a0f8f3..d83f41c 100644 > > --- a/mm/page-writeback.c > > +++ b/mm/page-writeback.c > > @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties; > > */ > > static int calc_period_shift(void) > > { > > - unsigned long dirty_total; > > + unsigned long dirty_total, dirty_bytes; > > > > - if (vm_dirty_bytes) > > - dirty_total = vm_dirty_bytes / PAGE_SIZE; > > + dirty_bytes = mem_cgroup_dirty_bytes(); > > + if (dirty_bytes) > > So you don't think 0 is a valid max dirty amount? A value of 0 means "disabled". It's used to select between dirty_ratio or dirty_bytes. It's the same for the global vm_dirty_* parameters. 
> > > + dirty_total = dirty_bytes / PAGE_SIZE; > > else > > - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / > > - 100; > > + dirty_total = (mem_cgroup_dirty_ratio() * > > + determine_dirtyable_memory()) / 100; > > return 2 + ilog2(dirty_total - 1); > > } > > > > @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) > > */ > > unsigned long determine_dirtyable_memory(void) > > { > > - unsigned long x; > > - > > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); > > + unsigned long memory; > > + s64 memcg_memory; > > > > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); > > if (!vm_highmem_is_dirtyable) > > - x -= highmem_dirtyable_memory(x); > > - > > - return x + 1; /* Ensure that we never return 0 */ > > + memory -= highmem_dirtyable_memory(memory); > > + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES); > > + if (memcg_memory < 0) > > And here you somehow return negative? > > > + return memory + 1; > > + return min((unsigned long)memcg_memory, memory + 1); > > } > > > > void > > @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, > > unsigned long *pbdi_dirty, struct backing_dev_info *bdi) > > { > > unsigned long background; > > - unsigned long dirty; > > + unsigned long dirty, dirty_bytes, dirty_background; > > unsigned long available_memory = determine_dirtyable_memory(); > > struct task_struct *tsk; > > > > - if (vm_dirty_bytes) > > - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); > > + dirty_bytes = mem_cgroup_dirty_bytes(); > > + if (dirty_bytes) > > zero not valid again > > > + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE); > > else { > > int dirty_ratio; > > > > - dirty_ratio = vm_dirty_ratio; > > + dirty_ratio = mem_cgroup_dirty_ratio(); > > if (dirty_ratio < 5) > > dirty_ratio = 5; > > dirty = (dirty_ratio * available_memory) / 100; > > } > > > > - if (dirty_background_bytes) > > - background = 
DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); > > + dirty_background = mem_cgroup_dirty_background_bytes(); > > + if (dirty_background) > > idem > > > + background = DIV_ROUND_UP(dirty_background, PAGE_SIZE); > > else > > - background = (dirty_background_ratio * available_memory) / 100; > > - > > + background = (mem_cgroup_dirty_background_ratio() * > > + available_memory) / 100; > > if (background >= dirty) > > background = dirty / 2; > > tsk = current; > > @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping, > > get_dirty_limits(&background_thresh, &dirty_thresh, > > &bdi_thresh, bdi); > > > > - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > > + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); > > + nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK); > > + if ((nr_reclaimable < 0) || (nr_writeback < 0)) { > > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > > global_page_state(NR_UNSTABLE_NFS); > > ??? why would a page_state be negative.. I see you return -ENOMEM on ! > cgroup, but how can one specify no dirty limit with this compiled in? > > > - nr_writeback = global_page_state(NR_WRITEBACK); > > + nr_writeback = global_page_state(NR_WRITEBACK); > > + } > > > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > > if (bdi_cap_account_unstable(bdi)) { > > @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping, > > * In normal mode, we start background writeout at the lower > > * background_thresh, to keep the amount of dirty memory low. > > */ > > + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); > > + if (nr_reclaimable < 0) > > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > > + global_page_state(NR_UNSTABLE_NFS); > > Again.. 
> > > if ((laptop_mode && pages_written) || > > - (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) > > - + global_page_state(NR_UNSTABLE_NFS)) > > - > background_thresh))) > > + (!laptop_mode && (nr_reclaimable > background_thresh))) > > bdi_start_writeback(bdi, NULL, 0); > > } > > > > @@ -678,6 +689,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > unsigned long dirty_thresh; > > > > for ( ; ; ) { > > + unsigned long dirty; > > + > > get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); > > > > /* > > @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > */ > > dirty_thresh += dirty_thresh / 10; /* wheeee... */ > > > > - if (global_page_state(NR_UNSTABLE_NFS) + > > - global_page_state(NR_WRITEBACK) <= dirty_thresh) > > - break; > > - congestion_wait(BLK_RW_ASYNC, HZ/10); > > + > > + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > > + if (dirty < 0) > > + dirty = global_page_state(NR_UNSTABLE_NFS) + > > + global_page_state(NR_WRITEBACK); > > and again.. > > > + if (dirty <= dirty_thresh) > > + break; > > + congestion_wait(BLK_RW_ASYNC, HZ/10); > > > > /* > > * The caller might hold locks which can prevent IO completion > > This is ugly and broken.. I thought you'd agreed to something like: > > if (mem_cgroup_has_dirty_limit(cgroup)) > use mem_cgroup numbers > else > use global numbers I agree mem_cgroup_has_dirty_limit() is nicer. But we must do that under RCU, so something like: rcu_read_lock(); if (mem_cgroup_has_dirty_limit()) mem_cgroup_get_page_stat() else global_page_state() rcu_read_unlock(); That is bad when mem_cgroup_has_dirty_limit() always returns false (e.g., when memory cgroups are disabled). So I fallback to the old interface. What do you think about: mem_cgroup_lock(); if (mem_cgroup_has_dirty_limit()) mem_cgroup_get_page_stat() else global_page_state() mem_cgroup_unlock(); Where mem_cgroup_read_lock/unlock() simply expand to nothing when memory cgroups are disabled. 
> > That allows for a 0 dirty limit (which should work and basically makes > all io synchronous). IMHO it is better to reserve 0 for the special value "disabled" like the global settings. A synchronous IO can be also achieved using a dirty limit of 1. > > Also, I'd put each of those in a separate function, like: > > unsigned long reclaimable_pages(cgroup) > { > if (mem_cgroup_has_dirty_limit(cgroup)) > return mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); > > return global_page_state(NR_FILE_DIRTY) + global_page_state(NR_NFS_UNSTABLE); > } Agreed. > > Which raises another question, you should probably rebase on top of > Trond's patches, which removes BDI_RECLAIMABLE, suggesting you also > loose MEMCG_NR_RECLAIM_PAGES in favour of the DIRTY+UNSTABLE split. OK, will look at Trond's work. Thanks, -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Andrea Righi on 2 Mar 2010 17:30 On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: > On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: > > On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: > > > > @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > > > */ > > > > dirty_thresh += dirty_thresh / 10; /* wheeee... */ > > > > > > > > - if (global_page_state(NR_UNSTABLE_NFS) + > > > > - global_page_state(NR_WRITEBACK) <= dirty_thresh) > > > > - break; > > > > - congestion_wait(BLK_RW_ASYNC, HZ/10); > > > > + > > > > + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > > > > + if (dirty < 0) > > > > + dirty = global_page_state(NR_UNSTABLE_NFS) + > > > > + global_page_state(NR_WRITEBACK); > > > > > > dirty is unsigned long. As mentioned last time, above will never be true? > > > In general these patches look ok to me. I will do some testing with these. > > > > Re-introduced the same bug. My bad. :( > > > > The value returned from mem_cgroup_page_stat() can be negative, i.e. > > when memory cgroup is disabled. We could simply use a long for dirty, > > the unit is in # of pages so s64 should be enough. Or cast dirty to long > > only for the check (see below). > > > > Thanks! > > -Andrea > > > > Signed-off-by: Andrea Righi <arighi(a)develer.com> > > --- > > mm/page-writeback.c | 2 +- > > 1 files changed, 1 insertions(+), 1 deletions(-) > > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > > index d83f41c..dbee976 100644 > > --- a/mm/page-writeback.c > > +++ b/mm/page-writeback.c > > @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > > > > > dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > > - if (dirty < 0) > > + if ((long)dirty < 0) > > This will also be problematic as on 32bit systems, your uppper limit of > dirty memory will be 2G? > > I guess, I will prefer one of the two. 
> > - return the error code from function and pass a pointer to store stats > in as function argument. > > - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if > per cgroup dirty control is enabled, then use per cgroup stats. In that > case you don't have to return negative values. > > Only tricky part will be careful accounting so that none of the stats go > negative in corner cases of migration etc. What do you think about Peter's suggestion + the locking stuff? (see the previous email). Otherwise, I'll choose the other solution, passing a pointer and always return the error code is not bad. Thanks, -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Daisuke Nishimura on 2 Mar 2010 18:30 On Tue, 2 Mar 2010 23:18:23 +0100, Andrea Righi <arighi(a)develer.com> wrote: > On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote: > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> [2010-03-02 17:23:16]: > > > > > On Tue, 2 Mar 2010 09:01:58 +0100 > > > Andrea Righi <arighi(a)develer.com> wrote: > > > > > > > On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: > > > > > On Mon, 1 Mar 2010 22:23:40 +0100 > > > > > Andrea Righi <arighi(a)develer.com> wrote: > > > > > > > > > > > Apply the cgroup dirty pages accounting and limiting infrastructure to > > > > > > the opportune kernel functions. > > > > > > > > > > > > Signed-off-by: Andrea Righi <arighi(a)develer.com> > > > > > > > > > > Seems nice. > > > > > > > > > > Hmm. the last problem is moving account between memcg. > > > > > > > > > > Right ? > > > > > > > > Correct. This was actually the last item of the TODO list. Anyway, I'm > > > > still considering if it's correct to move dirty pages when a task is > > > > migrated from a cgroup to another. Currently, dirty pages just remain in > > > > the original cgroup and are flushed depending on the original cgroup > > > > settings. That is not totally wrong... at least moving the dirty pages > > > > between memcgs should be optional (move_charge_at_immigrate?). > > > > > > > > > > My concern is > > > - migration between memcg is already suppoted > > > - at task move > > > - at rmdir > > > > > > Then, if you leave DIRTY_PAGE accounting to original cgroup, > > > the new cgroup (migration target)'s Dirty page accounting may > > > goes to be negative, or incorrect value. 
Please check FILE_MAPPED > > > implementation in __mem_cgroup_move_account() > > > > > > As > > > if (page_mapped(page) && !PageAnon(page)) { > > > /* Update mapped_file data for mem_cgroup */ > > > preempt_disable(); > > > __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); > > > __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); > > > preempt_enable(); > > > } > > > then, FILE_MAPPED never goes negative. > > > > > > > Absolutely! I am not sure how complex dirty memory migration will be, > > but one way of working around it would be to disable migration of > > charges when the feature is enabled (dirty* is set in the memory > > cgroup). We might need additional logic to allow that to happen. > > I've started to look at dirty memory migration. First attempt is to add > DIRTY, WRITEBACK, etc. to page_cgroup flags and handle them in > __mem_cgroup_move_account(). Probably I'll have something ready for the > next version of the patch. I still need to figure if this can work as > expected... > I agree it's a right direction(in fact, I have been planning to post a patch in that direction), so I leave it to you. Can you add PCG_FILE_MAPPED flag too ? I think this flag can be handled in the same way as other flags you're trying to add, and we can change "if (page_mapped(page) && !PageAnon(page))" to "if (PageCgroupFileMapped(pc)" in __mem_cgroup_move_account(). It would be cleaner than current code, IMHO. Thanks, Daisuke Nishimura. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Vivek Goyal on 2 Mar 2010 19:10 On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote: > On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: > > On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: > > > On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: > > > > > @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > > > > */ > > > > > dirty_thresh += dirty_thresh / 10; /* wheeee... */ > > > > > > > > > > - if (global_page_state(NR_UNSTABLE_NFS) + > > > > > - global_page_state(NR_WRITEBACK) <= dirty_thresh) > > > > > - break; > > > > > - congestion_wait(BLK_RW_ASYNC, HZ/10); > > > > > + > > > > > + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > > > > > + if (dirty < 0) > > > > > + dirty = global_page_state(NR_UNSTABLE_NFS) + > > > > > + global_page_state(NR_WRITEBACK); > > > > > > > > dirty is unsigned long. As mentioned last time, above will never be true? > > > > In general these patches look ok to me. I will do some testing with these. > > > > > > Re-introduced the same bug. My bad. :( > > > > > > The value returned from mem_cgroup_page_stat() can be negative, i.e. > > > when memory cgroup is disabled. We could simply use a long for dirty, > > > the unit is in # of pages so s64 should be enough. Or cast dirty to long > > > only for the check (see below). > > > > > > Thanks! 
> > > -Andrea > > > > > > Signed-off-by: Andrea Righi <arighi(a)develer.com> > > > --- > > > mm/page-writeback.c | 2 +- > > > 1 files changed, 1 insertions(+), 1 deletions(-) > > > > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > > > index d83f41c..dbee976 100644 > > > --- a/mm/page-writeback.c > > > +++ b/mm/page-writeback.c > > > @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) > > > > > > > > > dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > > > - if (dirty < 0) > > > + if ((long)dirty < 0) > > > > This will also be problematic as on 32bit systems, your upper limit of > > dirty memory will be 2G? > > > > I guess, I will prefer one of the two. > > > > - return the error code from function and pass a pointer to store stats > > in as function argument. > > > > - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if > > per cgroup dirty control is enabled, then use per cgroup stats. In that > > case you don't have to return negative values. > > > > Only tricky part will be careful accounting so that none of the stats go > > negative in corner cases of migration etc. > > What do you think about Peter's suggestion + the locking stuff? (see the > previous email). Otherwise, I'll choose the other solution, passing a > pointer and always return the error code is not bad. > Ok, so you are worried that by the time we finish mem_cgroup_has_dirty_limit() call, task might change cgroup and later we might call mem_cgroup_get_page_stat() on a different cgroup altogether which might or might not have dirty limits specified? But in what cases you don't want to use memory cgroup specified limit? I thought cgroup disabled was the only case where we need to use global limits. Otherwise a memory cgroup will have either dirty_bytes specified or by default inherit global dirty_ratio which is a valid number. If that's the case then you don't have to take rcu_lock() outside get_page_stat()? 
IOW, apart from cgroup being disabled, what are the other cases where you expect to not use cgroup's page stat and use global stats? Thanks Vivek > Thanks, > -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Daisuke Nishimura on 2 Mar 2010 21:20
> diff --git a/mm/filemap.c b/mm/filemap.c > index fe09e51..f85acae 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) > * having removed the page entirely. > */ > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); > dec_zone_page_state(page, NR_FILE_DIRTY); > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); > } (snip) > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) > void account_page_dirtied(struct page *page, struct address_space *mapping) > { > if (mapping_cap_account_dirty(mapping)) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); > __inc_zone_page_state(page, NR_FILE_DIRTY); > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); > task_dirty_inc(current); As long as I can see, those two functions(at least) calls mem_cgroup_update_state(), which acquires page cgroup lock, under mapping->tree_lock. But as I fixed before in commit e767e056, page cgroup lock must not acquired under mapping->tree_lock. hmm, we should call those mem_cgroup_update_state() outside mapping->tree_lock, or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock. Thanks, Daisuke Nishimura. On Mon, 1 Mar 2010 22:23:40 +0100, Andrea Righi <arighi(a)develer.com> wrote: > Apply the cgroup dirty pages accounting and limiting infrastructure to > the opportune kernel functions. 
> > Signed-off-by: Andrea Righi <arighi(a)develer.com> > --- > fs/fuse/file.c | 5 +++ > fs/nfs/write.c | 4 ++ > fs/nilfs2/segment.c | 10 +++++- > mm/filemap.c | 1 + > mm/page-writeback.c | 84 ++++++++++++++++++++++++++++++++------------------ > mm/rmap.c | 4 +- > mm/truncate.c | 2 + > 7 files changed, 76 insertions(+), 34 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index a9f5e13..dbbdd53 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -11,6 +11,7 @@ > #include <linux/pagemap.h> > #include <linux/slab.h> > #include <linux/kernel.h> > +#include <linux/memcontrol.h> > #include <linux/sched.h> > #include <linux/module.h> > > @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) > > list_del(&req->writepages_entry); > dec_bdi_stat(bdi, BDI_WRITEBACK); > + mem_cgroup_update_stat(req->pages[0], > + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); > dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); > bdi_writeout_inc(bdi); > wake_up(&fi->page_waitq); > @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) > req->inode = inode; > > inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); > + mem_cgroup_update_stat(tmp_page, > + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); > inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); > end_page_writeback(page); > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index b753242..7316f7a 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) > req->wb_index, > NFS_PAGE_TAG_COMMIT); > spin_unlock(&inode->i_lock); > + mem_cgroup_update_stat(req->wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE); > __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) > struct page *page = req->wb_page; > > if (test_and_clear_bit(PG_CLEAN, 
&(req)->wb_flags)) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); > dec_zone_page_state(page, NR_UNSTABLE_NFS); > dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE); > return 1; > @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) > req = nfs_list_entry(head->next); > nfs_list_remove_request(req); > nfs_mark_request_commit(req); > + mem_cgroup_update_stat(req->wb_page, > + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); > dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > dec_bdi_stat(req->wb_page->mapping->backing_dev_info, > BDI_UNSTABLE); > diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c > index ada2f1b..aef6d13 100644 > --- a/fs/nilfs2/segment.c > +++ b/fs/nilfs2/segment.c > @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) > } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); > kunmap_atomic(kaddr, KM_USER0); > > - if (!TestSetPageWriteback(clone_page)) > + if (!TestSetPageWriteback(clone_page)) { > + mem_cgroup_update_stat(clone_page, > + MEM_CGROUP_STAT_WRITEBACK, 1); > inc_zone_page_state(clone_page, NR_WRITEBACK); > + } > unlock_page(clone_page); > > return 0; > @@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int err) > } > > if (buffer_nilfs_allocated(page_buffers(page))) { > - if (TestClearPageWriteback(page)) > + if (TestClearPageWriteback(page)) { > + mem_cgroup_update_stat(clone_page, > + MEM_CGROUP_STAT_WRITEBACK, -1); > dec_zone_page_state(page, NR_WRITEBACK); > + } > } else > end_page_writeback(page); > } > diff --git a/mm/filemap.c b/mm/filemap.c > index fe09e51..f85acae 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) > * having removed the page entirely. 
> */ > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); > dec_zone_page_state(page, NR_FILE_DIRTY); > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); > } > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index 5a0f8f3..d83f41c 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties; > */ > static int calc_period_shift(void) > { > - unsigned long dirty_total; > + unsigned long dirty_total, dirty_bytes; > > - if (vm_dirty_bytes) > - dirty_total = vm_dirty_bytes / PAGE_SIZE; > + dirty_bytes = mem_cgroup_dirty_bytes(); > + if (dirty_bytes) > + dirty_total = dirty_bytes / PAGE_SIZE; > else > - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / > - 100; > + dirty_total = (mem_cgroup_dirty_ratio() * > + determine_dirtyable_memory()) / 100; > return 2 + ilog2(dirty_total - 1); > } > > @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) > */ > unsigned long determine_dirtyable_memory(void) > { > - unsigned long x; > - > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); > + unsigned long memory; > + s64 memcg_memory; > > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); > if (!vm_highmem_is_dirtyable) > - x -= highmem_dirtyable_memory(x); > - > - return x + 1; /* Ensure that we never return 0 */ > + memory -= highmem_dirtyable_memory(memory); > + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES); > + if (memcg_memory < 0) > + return memory + 1; > + return min((unsigned long)memcg_memory, memory + 1); > } > > void > @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, > unsigned long *pbdi_dirty, struct backing_dev_info *bdi) > { > unsigned long background; > - unsigned long dirty; > + unsigned long dirty, dirty_bytes, dirty_background; > unsigned long available_memory = 
determine_dirtyable_memory(); > struct task_struct *tsk; > > - if (vm_dirty_bytes) > - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); > + dirty_bytes = mem_cgroup_dirty_bytes(); > + if (dirty_bytes) > + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE); > else { > int dirty_ratio; > > - dirty_ratio = vm_dirty_ratio; > + dirty_ratio = mem_cgroup_dirty_ratio(); > if (dirty_ratio < 5) > dirty_ratio = 5; > dirty = (dirty_ratio * available_memory) / 100; > } > > - if (dirty_background_bytes) > - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); > + dirty_background = mem_cgroup_dirty_background_bytes(); > + if (dirty_background) > + background = DIV_ROUND_UP(dirty_background, PAGE_SIZE); > else > - background = (dirty_background_ratio * available_memory) / 100; > - > + background = (mem_cgroup_dirty_background_ratio() * > + available_memory) / 100; > if (background >= dirty) > background = dirty / 2; > tsk = current; > @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping, > get_dirty_limits(&background_thresh, &dirty_thresh, > &bdi_thresh, bdi); > > - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); > + nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK); > + if ((nr_reclaimable < 0) || (nr_writeback < 0)) { > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > global_page_state(NR_UNSTABLE_NFS); > - nr_writeback = global_page_state(NR_WRITEBACK); > + nr_writeback = global_page_state(NR_WRITEBACK); > + } > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > if (bdi_cap_account_unstable(bdi)) { > @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping, > * In normal mode, we start background writeout at the lower > * background_thresh, to keep the amount of dirty memory low. 
> */ > + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); > + if (nr_reclaimable < 0) > + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > + global_page_state(NR_UNSTABLE_NFS); > if ((laptop_mode && pages_written) || > - (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) > - + global_page_state(NR_UNSTABLE_NFS)) > - > background_thresh))) > + (!laptop_mode && (nr_reclaimable > background_thresh))) > bdi_start_writeback(bdi, NULL, 0); > } > > @@ -678,6 +689,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) > unsigned long dirty_thresh; > > for ( ; ; ) { > + unsigned long dirty; > + > get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); > > /* > @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) > */ > dirty_thresh += dirty_thresh / 10; /* wheeee... */ > > - if (global_page_state(NR_UNSTABLE_NFS) + > - global_page_state(NR_WRITEBACK) <= dirty_thresh) > - break; > - congestion_wait(BLK_RW_ASYNC, HZ/10); > + > + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); > + if (dirty < 0) > + dirty = global_page_state(NR_UNSTABLE_NFS) + > + global_page_state(NR_WRITEBACK); > + if (dirty <= dirty_thresh) > + break; > + congestion_wait(BLK_RW_ASYNC, HZ/10); > > /* > * The caller might hold locks which can prevent IO completion > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) > void account_page_dirtied(struct page *page, struct address_space *mapping) > { > if (mapping_cap_account_dirty(mapping)) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); > __inc_zone_page_state(page, NR_FILE_DIRTY); > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); > task_dirty_inc(current); > @@ -1297,6 +1315,8 @@ int clear_page_dirty_for_io(struct page *page) > * for more comments. 
> */ > if (TestClearPageDirty(page)) { > + mem_cgroup_update_stat(page, > + MEM_CGROUP_STAT_FILE_DIRTY, -1); > dec_zone_page_state(page, NR_FILE_DIRTY); > dec_bdi_stat(mapping->backing_dev_info, > BDI_DIRTY); > @@ -1332,8 +1352,10 @@ int test_clear_page_writeback(struct page *page) > } else { > ret = TestClearPageWriteback(page); > } > - if (ret) > + if (ret) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_WRITEBACK, -1); > dec_zone_page_state(page, NR_WRITEBACK); > + } > return ret; > } > > @@ -1363,8 +1385,10 @@ int test_set_page_writeback(struct page *page) > } else { > ret = TestSetPageWriteback(page); > } > - if (!ret) > + if (!ret) { > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_WRITEBACK, 1); > inc_zone_page_state(page, NR_WRITEBACK); > + } > return ret; > > } > diff --git a/mm/rmap.c b/mm/rmap.c > index 4d2fb93..8d74335 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -832,7 +832,7 @@ void page_add_file_rmap(struct page *page) > { > if (atomic_inc_and_test(&page->_mapcount)) { > __inc_zone_page_state(page, NR_FILE_MAPPED); > - mem_cgroup_update_file_mapped(page, 1); > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, 1); > } > } > > @@ -864,7 +864,7 @@ void page_remove_rmap(struct page *page) > __dec_zone_page_state(page, NR_ANON_PAGES); > } else { > __dec_zone_page_state(page, NR_FILE_MAPPED); > - mem_cgroup_update_file_mapped(page, -1); > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -1); > } > /* > * It would be tidy to reset the PageAnon mapping here, > diff --git a/mm/truncate.c b/mm/truncate.c > index 2466e0c..5f437e7 100644 > --- a/mm/truncate.c > +++ b/mm/truncate.c > @@ -73,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) > if (TestClearPageDirty(page)) { > struct address_space *mapping = page->mapping; > if (mapping && mapping_cap_account_dirty(mapping)) { > + mem_cgroup_update_stat(page, > + MEM_CGROUP_STAT_FILE_DIRTY, -1); > dec_zone_page_state(page, NR_FILE_DIRTY); > 
dec_bdi_stat(mapping->backing_dev_info, > BDI_DIRTY); > -- > 1.6.3.3 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |