From: Mel Gorman
From: Wu Fengguang <fengguang.wu(a)intel.com>

This is a roll-up of the series entitled "[RFC] writeback: try to write
older pages first".

No signoff required
---
fs/fs-writeback.c | 83 ++++++++++++++++++++------------------
include/linux/writeback.h | 4 +-
include/trace/events/writeback.h | 9 +----
mm/backing-dev.c | 1 -
mm/page-writeback.c | 1 -
5 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 10f939a..5a3c764 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -213,20 +213,34 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
* Move expired dirty inodes from @delaying_queue to @dispatch_queue.
*/
static void move_expired_inodes(struct list_head *delaying_queue,
- struct list_head *dispatch_queue,
- unsigned long *older_than_this)
+ struct list_head *dispatch_queue,
+ struct writeback_control *wbc)
{
+ unsigned long expire_interval = 0;
+ unsigned long older_than_this = 0; /* reset to kill gcc warning */
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;

+ if (wbc->for_kupdate || wbc->for_background) {
+ expire_interval = msecs_to_jiffies(dirty_expire_interval * 10);
+ older_than_this = jiffies - expire_interval;
+ }
+
while (!list_empty(delaying_queue)) {
inode = list_entry(delaying_queue->prev, struct inode, i_list);
- if (older_than_this &&
- inode_dirtied_after(inode, *older_than_this))
- break;
+ if (expire_interval &&
+ inode_dirtied_after(inode, older_than_this)) {
+ if (wbc->for_background &&
+ list_empty(dispatch_queue) && list_empty(&tmp)) {
+ expire_interval >>= 1;
+ older_than_this = jiffies - expire_interval;
+ continue;
+ } else
+ break;
+ }
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
@@ -254,10 +268,10 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
* Queue all expired dirty inodes for io, eldest first.
*/
-static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, struct writeback_control *wbc)
{
list_splice_init(&wb->b_more_io, wb->b_io.prev);
- move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ move_expired_inodes(&wb->b_dirty, &wb->b_io, wbc);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -362,6 +376,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
+ if (!err)
+ wbc->inodes_written++;
}

spin_lock(&inode_lock);
@@ -528,12 +544,8 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
iput(inode);
cond_resched();
spin_lock(&inode_lock);
- if (wbc->nr_to_write <= 0) {
- wbc->more_io = 1;
+ if (wbc->nr_to_write <= 0)
return 1;
- }
- if (!list_empty(&wb->b_more_io))
- wbc->more_io = 1;
}
/* b_io is empty */
return 1;
@@ -546,8 +558,9 @@ void writeback_inodes_wb(struct bdi_writeback *wb,

wbc->wb_start = jiffies; /* livelock avoidance */
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
+
+ if (!(wbc->for_kupdate || wbc->for_background) || list_empty(&wb->b_io))
+ queue_io(wb, wbc);

while (!list_empty(&wb->b_io)) {
struct inode *inode = list_entry(wb->b_io.prev,
@@ -575,8 +588,8 @@ static void __writeback_inodes_sb(struct super_block *sb,

wbc->wb_start = jiffies; /* livelock avoidance */
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
+ if (!(wbc->for_kupdate || wbc->for_background) || list_empty(&wb->b_io))
+ queue_io(wb, wbc);
writeback_sb_inodes(sb, wb, wbc, true);
spin_unlock(&inode_lock);
}
@@ -611,29 +624,19 @@ static inline bool over_bground_thresh(void)
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
struct writeback_control wbc = {
.sync_mode = work->sync_mode,
- .older_than_this = NULL,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
.range_cyclic = work->range_cyclic,
};
- unsigned long oldest_jif;
long wrote = 0;
struct inode *inode;

- if (wbc.for_kupdate) {
- wbc.older_than_this = &oldest_jif;
- oldest_jif = jiffies -
- msecs_to_jiffies(dirty_expire_interval * 10);
- }
if (!wbc.range_cyclic) {
wbc.range_start = 0;
wbc.range_end = LLONG_MAX;
@@ -653,9 +656,9 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->for_background && !over_bground_thresh())
break;

- wbc.more_io = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
+ wbc.inodes_written = 0;

trace_wbc_writeback_start(&wbc, wb->bdi);
if (work->sb)
@@ -668,20 +671,25 @@ static long wb_writeback(struct bdi_writeback *wb,
wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

/*
- * If we consumed everything, see if we have more
+ * Did we write something? Try for more
+ *
+ * This is needed _before_ the b_more_io test because the
+ * background writeback moves inodes to b_io and works on
+ * them in batches (in order to sync old pages first). The
+ * completion of the current batch does not necessarily mean
+ * the overall work is done.
*/
- if (wbc.nr_to_write <= 0)
+ if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ continue;
+ if (wbc.inodes_written)
continue;
+
/*
- * Didn't write everything and we don't have more IO, bail
+ * Nothing written and no more inodes for IO, bail
*/
- if (!wbc.more_io)
+ if (list_empty(&wb->b_more_io))
break;
- /*
- * Did we write something? Try for more
- */
- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
- continue;
+
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
@@ -1026,9 +1034,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
* Write out a superblock's list of dirty inodes. A wait will be performed
* upon no inodes, all inodes or the final one, depending upon sync_mode.
*
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c24eca7..494edd6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -28,14 +28,13 @@ enum writeback_sync_modes {
*/
struct writeback_control {
enum writeback_sync_modes sync_mode;
- unsigned long *older_than_this; /* If !NULL, only write back inodes
- older than this */
unsigned long wb_start; /* Time writeback_inodes_wb was
called. This is needed to avoid
extra jobs and livelock */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
long pages_skipped; /* Pages which were not written */
+ long inodes_written; /* Number of inodes(metadata) synced */

/*
* For a_ops->writepages(): is start or end are non-zero then this is
@@ -51,7 +50,6 @@ struct writeback_control {
unsigned for_background:1; /* A background writeback */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
- unsigned more_io:1; /* more io to be dispatched */
};

/*
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 0be26ac..dc8001f 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -99,8 +99,6 @@ DECLARE_EVENT_CLASS(wbc_class,
__field(int, for_background)
__field(int, for_reclaim)
__field(int, range_cyclic)
- __field(int, more_io)
- __field(unsigned long, older_than_this)
__field(long, range_start)
__field(long, range_end)
),
@@ -114,15 +112,12 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_background = wbc->for_background;
__entry->for_reclaim = wbc->for_reclaim;
__entry->range_cyclic = wbc->range_cyclic;
- __entry->more_io = wbc->more_io;
- __entry->older_than_this = wbc->older_than_this ?
- *wbc->older_than_this : 0;
__entry->range_start = (long)wbc->range_start;
__entry->range_end = (long)wbc->range_end;
),

TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
- "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+ "bgrd=%d reclm=%d cyclic=%d "
"start=0x%lx end=0x%lx",
__entry->name,
__entry->nr_to_write,
@@ -132,8 +127,6 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_background,
__entry->for_reclaim,
__entry->range_cyclic,
- __entry->more_io,
- __entry->older_than_this,
__entry->range_start,
__entry->range_end)
)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ac78a33..eaea7e0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -270,7 +270,6 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
.range_cyclic = 1,
.nr_to_write = 1024,
};
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d556cd8..d4a9e3d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -497,7 +497,6 @@ static void balance_dirty_pages(struct address_space *mapping,
for (;;) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
.nr_to_write = write_chunk,
.range_cyclic = 1,
};
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/