From: Chris Mason on
When IPC semaphores are used in a bulk post and wait system, we
can end up waking a very large number of processes per semtimedop call.
At least one major database will use a single process to kick hundreds
of other processes at a time.

This patch tries to reduce the runqueue lock contention by ordering the
wakeups based on the CPU the waiting process was on when it went to
sleep.

A later patch could add some code in the scheduler to help
wake these up in bulk and take the various runqueue locks less often.

Signed-off-by: Chris Mason <chris.mason(a)oracle.com>
---
include/linux/sem.h | 1 +
ipc/sem.c | 37 ++++++++++++++++++++++++++++++++++++-
2 files changed, 37 insertions(+), 1 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 8b97b51..4a37319 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -104,6 +104,7 @@ struct sem_array {
struct sem_queue {
struct list_head list; /* queue of pending operations */
struct task_struct *sleeper; /* this process */
+ unsigned long sleep_cpu;
struct sem_undo *undo; /* undo structure */
int pid; /* process id of requesting process */
int status; /* completion status of operation */
diff --git a/ipc/sem.c b/ipc/sem.c
index 335cd35..07fe1d5 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -544,6 +544,25 @@ static void wake_up_sem_queue(struct sem_queue *q, int error)
preempt_enable();
}

+/*
+ * sorting helper for struct sem_queues in a list. This is used to
+ * sort by the CPU they are likely to be on when waking them.
+ */
+int list_comp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct sem_queue *qa;
+ struct sem_queue *qb;
+
+ qa = list_entry(a, struct sem_queue, list);
+ qb = list_entry(b, struct sem_queue, list);
+
+ if (qa->sleep_cpu < qb->sleep_cpu)
+ return -1;
+ if (qa->sleep_cpu > qb->sleep_cpu)
+ return 1;
+ return 0;
+}
+
/**
* update_queue(sma, semnum): Look for tasks that can be completed.
* @sma: semaphore array.
@@ -557,6 +576,7 @@ static void update_queue(struct sem_array *sma, struct list_head *pending_list)
struct sem_queue *q;
LIST_HEAD(new_pending);
LIST_HEAD(work_list);
+ LIST_HEAD(wake_list);

/*
* this seems strange, but what we want to do is process everything
@@ -591,7 +611,10 @@ again:
spin_unlock(&blocker->lock);
continue;
}
- wake_up_sem_queue(q, error);
+ if (error)
+ wake_up_sem_queue(q, error);
+ else
+ list_add_tail(&q->list, &wake_list);

}

@@ -599,6 +622,13 @@ again:
list_splice_init(&new_pending, &work_list);
goto again;
}
+
+ list_sort(NULL, &wake_list, list_comp);
+ while (!list_empty(&wake_list)) {
+ q = list_entry(wake_list.next, struct sem_queue, list);
+ list_del_init(&q->list);
+ wake_up_sem_queue(q, 0);
+ }
}

/* The following counts are associated to each semaphore:
@@ -1440,6 +1470,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
queue.status = -EINTR;
queue.sleeper = current;

+ /*
+ * the sleep_cpu number allows sorting by the CPU we expect
+ * their runqueue entry to be on..hopefully faster for waking up
+ */
+ queue.sleep_cpu = my_cpu_offset;
current->state = TASK_INTERRUPTIBLE;

/*
--
1.7.0.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/