Prev: [PATCH 3/3] perf probe: Fix the logic of die_compare_name
Next: [-next July 9 - s390 ] Badness at fs/sysfs/symlink.c:82 during qeth initalization
From: Andrea Righi on 9 Jul 2010 05:30 On Thu, Jul 08, 2010 at 11:16:28PM -0400, Munehiro Ikeda wrote: > Embedding hooks for iotrack to record process info, namely > cgroup ID. > This patch is based on a patch posted from Ryo Tsuruta on Oct 2, > 2009 titled "Page tracking hooks". > > Signed-off-by: Hirokazu Takahashi <taka(a)valinux.co.jp> > Signed-off-by: Ryo Tsuruta <ryov(a)valinux.co.jp> > Signed-off-by: Munehiro "Muuhh" Ikeda <m-ikeda(a)ds.jp.nec.com> > --- > fs/buffer.c | 2 ++ > fs/direct-io.c | 2 ++ > mm/bounce.c | 2 ++ > mm/filemap.c | 2 ++ > mm/memory.c | 5 +++++ > mm/page-writeback.c | 2 ++ > mm/swap_state.c | 2 ++ > 7 files changed, 17 insertions(+), 0 deletions(-) > > diff --git a/fs/buffer.c b/fs/buffer.c > index d54812b..c418fdf 100644 > --- a/fs/buffer.c > +++ b/fs/buffer.c > @@ -36,6 +36,7 @@ > #include <linux/buffer_head.h> > #include <linux/task_io_accounting_ops.h> > #include <linux/bio.h> > +#include <linux/blk-iotrack.h> > #include <linux/notifier.h> > #include <linux/cpu.h> > #include <linux/bitops.h> > @@ -667,6 +668,7 @@ static void __set_page_dirty(struct page *page, > if (page->mapping) { /* Race with truncate? */ > WARN_ON_ONCE(warn && !PageUptodate(page)); > account_page_dirtied(page, mapping); > + blk_iotrack_reset_owner_pagedirty(page, current->mm); > radix_tree_tag_set(&mapping->page_tree, > page_index(page), PAGECACHE_TAG_DIRTY); > } > diff --git a/fs/direct-io.c b/fs/direct-io.c > index 7600aac..2c1f42f 100644 > --- a/fs/direct-io.c > +++ b/fs/direct-io.c > @@ -33,6 +33,7 @@ > #include <linux/err.h> > #include <linux/blkdev.h> > #include <linux/buffer_head.h> > +#include <linux/blk-iotrack.h> > #include <linux/rwsem.h> > #include <linux/uio.h> > #include <asm/atomic.h> > @@ -846,6 +847,7 @@ static int do_direct_IO(struct dio *dio) > ret = PTR_ERR(page); > goto out; > } > + blk_iotrack_reset_owner(page, current->mm); > > while (block_in_page < blocks_per_page) { > unsigned offset_in_page = block_in_page << blkbits; > diff --git a/mm/bounce.c b/mm/bounce.c > index 13b6dad..04339df 100644 > --- a/mm/bounce.c > +++ b/mm/bounce.c > @@ -14,6 +14,7 @@ > #include <linux/init.h> > #include <linux/hash.h> > #include <linux/highmem.h> > +#include <linux/blk-iotrack.h> > #include <asm/tlbflush.h> > > #include <trace/events/block.h> > @@ -211,6 +212,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, > to->bv_len = from->bv_len; > to->bv_offset = from->bv_offset; > inc_zone_page_state(to->bv_page, NR_BOUNCE); > + blk_iotrack_copy_owner(to->bv_page, page); > > if (rw == WRITE) { > char *vto, *vfrom; > diff --git a/mm/filemap.c b/mm/filemap.c > index 20e5642..a255d0c 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -33,6 +33,7 @@ > #include <linux/cpuset.h> > #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ > #include <linux/memcontrol.h> > +#include <linux/blk-iotrack.h> > #include <linux/mm_inline.h> /* for page_is_file_cache() */ > #include "internal.h" > > @@ -405,6 +406,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, > gfp_mask & GFP_RECLAIM_MASK); > if (error) > goto out; > + blk_iotrack_set_owner(page, current->mm); > > error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); > if (error == 0) { > diff --git a/mm/memory.c b/mm/memory.c > index 119b7cc..3eb2d0d 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -52,6 +52,7 @@ > #include <linux/init.h> > #include <linux/writeback.h> > #include <linux/memcontrol.h> > +#include <linux/blk-iotrack.h> > #include <linux/mmu_notifier.h> > #include <linux/kallsyms.h> > #include <linux/swapops.h> > @@ -2283,6 +2284,7 @@ gotten: > */ > ptep_clear_flush(vma, address, page_table); > page_add_new_anon_rmap(new_page, vma, address); > + blk_iotrack_set_owner(new_page, mm); > /* > * We call the notify macro here because, when using secondary > * mmu page tables (such as kvm shadow page tables), we want the > @@ -2718,6 +2720,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, > flush_icache_page(vma, page); > set_pte_at(mm, address, page_table, pte); > page_add_anon_rmap(page, vma, address); > + blk_iotrack_reset_owner(page, mm); > /* It's better to call commit-charge after rmap is established */ > mem_cgroup_commit_charge_swapin(page, ptr); > > @@ -2795,6 +2798,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, > > inc_mm_counter_fast(mm, MM_ANONPAGES); > page_add_new_anon_rmap(page, vma, address); > + blk_iotrack_set_owner(page, mm); We're tracking anonymous memory here. Should we charge the cost of the swap IO to the root cgroup or the actual owner of the page? there was a previous discussion about this topic, but can't remember the details, sorry. IMHO we could remove some overhead simply ignoring the tracking of anonymous pages (swap IO) and just consider direct and writeback IO of file pages. > setpte: > set_pte_at(mm, address, page_table, entry); > > @@ -2949,6 +2953,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (anon) { > inc_mm_counter_fast(mm, MM_ANONPAGES); > page_add_new_anon_rmap(page, vma, address); > + blk_iotrack_set_owner(page, mm); Ditto. > } else { > inc_mm_counter_fast(mm, MM_FILEPAGES); > page_add_file_rmap(page); > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index 54f28bd..f3e6b2c 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -23,6 +23,7 @@ > #include <linux/init.h> > #include <linux/backing-dev.h> > #include <linux/task_io_accounting_ops.h> > +#include <linux/blk-iotrack.h> > #include <linux/blkdev.h> > #include <linux/mpage.h> > #include <linux/rmap.h> > @@ -1128,6 +1129,7 @@ int __set_page_dirty_nobuffers(struct page *page) > BUG_ON(mapping2 != mapping); > WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); > account_page_dirtied(page, mapping); > + blk_iotrack_reset_owner_pagedirty(page, current->mm); > radix_tree_tag_set(&mapping->page_tree, > page_index(page), PAGECACHE_TAG_DIRTY); > } > diff --git a/mm/swap_state.c b/mm/swap_state.c > index e10f583..ab26978 100644 > --- a/mm/swap_state.c > +++ b/mm/swap_state.c > @@ -19,6 +19,7 @@ > #include <linux/pagevec.h> > #include <linux/migrate.h> > #include <linux/page_cgroup.h> > +#include <linux/blk-iotrack.h> > > #include <asm/pgtable.h> > > @@ -324,6 +325,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > /* May fail (-ENOMEM) if radix-tree node allocation failed. */ > __set_page_locked(new_page); > SetPageSwapBacked(new_page); > + blk_iotrack_set_owner(new_page, current->mm); Ditto. > err = __add_to_swap_cache(new_page, entry); > if (likely(!err)) { > radix_tree_preload_end(); > -- > 1.6.2.5 Thanks, -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Munehiro Ikeda on 9 Jul 2010 19:50
Andrea Righi wrote, on 07/09/2010 05:24 AM: > On Thu, Jul 08, 2010 at 11:16:28PM -0400, Munehiro Ikeda wrote: >> Embedding hooks for iotrack to record process info, namely >> cgroup ID. >> This patch is based on a patch posted from Ryo Tsuruta on Oct 2, >> 2009 titled "Page tracking hooks". >> >> Signed-off-by: Hirokazu Takahashi<taka(a)valinux.co.jp> >> Signed-off-by: Ryo Tsuruta<ryov(a)valinux.co.jp> >> Signed-off-by: Munehiro "Muuhh" Ikeda<m-ikeda(a)ds.jp.nec.com> >> --- >> fs/buffer.c | 2 ++ >> fs/direct-io.c | 2 ++ >> mm/bounce.c | 2 ++ >> mm/filemap.c | 2 ++ >> mm/memory.c | 5 +++++ >> mm/page-writeback.c | 2 ++ >> mm/swap_state.c | 2 ++ >> 7 files changed, 17 insertions(+), 0 deletions(-) >> (snip) >> diff --git a/mm/memory.c b/mm/memory.c >> index 119b7cc..3eb2d0d 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -52,6 +52,7 @@ >> #include<linux/init.h> >> #include<linux/writeback.h> >> #include<linux/memcontrol.h> >> +#include<linux/blk-iotrack.h> >> #include<linux/mmu_notifier.h> >> #include<linux/kallsyms.h> >> #include<linux/swapops.h> >> @@ -2283,6 +2284,7 @@ gotten: >> */ >> ptep_clear_flush(vma, address, page_table); >> page_add_new_anon_rmap(new_page, vma, address); >> + blk_iotrack_set_owner(new_page, mm); >> /* >> * We call the notify macro here because, when using secondary >> * mmu page tables (such as kvm shadow page tables), we want the >> @@ -2718,6 +2720,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, >> flush_icache_page(vma, page); >> set_pte_at(mm, address, page_table, pte); >> page_add_anon_rmap(page, vma, address); >> + blk_iotrack_reset_owner(page, mm); >> /* It's better to call commit-charge after rmap is established */ >> mem_cgroup_commit_charge_swapin(page, ptr); >> >> @@ -2795,6 +2798,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, >> >> inc_mm_counter_fast(mm, MM_ANONPAGES); >> page_add_new_anon_rmap(page, vma, address); >> + blk_iotrack_set_owner(page, mm); > > We're tracking anonymous memory here. Should we charge the cost of the > swap IO to the root cgroup or the actual owner of the page? there was a > previous discussion about this topic, but can't remember the details, > sorry. > > IMHO we could remove some overhead simply ignoring the tracking of > anonymous pages (swap IO) and just consider direct and writeback IO of > file pages. Well, this needs a decision. Actually who should be charged for swap IO has multiple options. (1) root cgroup (2) page owner (3) memory hogger who triggered swap The choice of this patch is (2). However, I agree that there is no concrete rationale to select (2) but (3), and (1) is the simplest and the best for performance. Alright, I will remove anonymous pages from tracking target. Thanks, Muuhh >> setpte: >> set_pte_at(mm, address, page_table, entry); >> >> @@ -2949,6 +2953,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, >> if (anon) { >> inc_mm_counter_fast(mm, MM_ANONPAGES); >> page_add_new_anon_rmap(page, vma, address); >> + blk_iotrack_set_owner(page, mm); > > Ditto. > >> } else { >> inc_mm_counter_fast(mm, MM_FILEPAGES); >> page_add_file_rmap(page); >> diff --git a/mm/page-writeback.c b/mm/page-writeback.c >> index 54f28bd..f3e6b2c 100644 >> --- a/mm/page-writeback.c >> +++ b/mm/page-writeback.c >> @@ -23,6 +23,7 @@ >> #include<linux/init.h> >> #include<linux/backing-dev.h> >> #include<linux/task_io_accounting_ops.h> >> +#include<linux/blk-iotrack.h> >> #include<linux/blkdev.h> >> #include<linux/mpage.h> >> #include<linux/rmap.h> >> @@ -1128,6 +1129,7 @@ int __set_page_dirty_nobuffers(struct page *page) >> BUG_ON(mapping2 != mapping); >> WARN_ON_ONCE(!PagePrivate(page)&& !PageUptodate(page)); >> account_page_dirtied(page, mapping); >> + blk_iotrack_reset_owner_pagedirty(page, current->mm); >> radix_tree_tag_set(&mapping->page_tree, >> page_index(page), PAGECACHE_TAG_DIRTY); >> } >> diff --git a/mm/swap_state.c b/mm/swap_state.c >> index e10f583..ab26978 100644 >> --- a/mm/swap_state.c >> +++ b/mm/swap_state.c >> @@ -19,6 +19,7 @@ >> #include<linux/pagevec.h> >> #include<linux/migrate.h> >> #include<linux/page_cgroup.h> >> +#include<linux/blk-iotrack.h> >> >> #include<asm/pgtable.h> >> >> @@ -324,6 +325,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, >> /* May fail (-ENOMEM) if radix-tree node allocation failed. */ >> __set_page_locked(new_page); >> SetPageSwapBacked(new_page); >> + blk_iotrack_set_owner(new_page, current->mm); > > Ditto. > >> err = __add_to_swap_cache(new_page, entry); >> if (likely(!err)) { >> radix_tree_preload_end(); >> -- >> 1.6.2.5 > > Thanks, > -Andrea > -- IKEDA, Munehiro NEC Corporation of America m-ikeda(a)ds.jp.nec.com -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |